dircrawl 0.0.9 → 0.0.10
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/dircrawl.rb +43 -6
- metadata +4 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: ba3c1a9f1c59a90f094781a6becc3f221e9cfbe4
|
4
|
+
data.tar.gz: 4bbbac56d9ee80eaf8e040b8133ae56f25d9da57
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e2af2bdcdec751ab8c35bacaebbd8f6bad4adf400c933e1a68828fd9312fbbd88923e6947a4c0fed377c3e4879c8e74a04c482286f45c18d37ed0022395ed73e
|
7
|
+
data.tar.gz: 61c5942ba31099659c5874da9850ae6f15441e1b5764b46cdee69e3329a4e57a4ee1b78c4bbb5130925d728e4178e0543f2d651b0da51d3742725bea0c3f8347
|
data/lib/dircrawl.rb
CHANGED
@@ -1,8 +1,11 @@
|
|
1
1
|
require 'json'
|
2
2
|
require 'pry'
|
3
|
+
require 'curb'
|
4
|
+
require 'selenium-webdriver'
|
5
|
+
require 'uri'
|
3
6
|
|
4
7
|
class DirCrawl
|
5
|
-
def initialize(path, output_dir, ignore_includes, save, process_block, include_block, extras_block, failure_mode, *args)
|
8
|
+
def initialize(path, output_dir, ignore_includes, save, process_block, include_block, extras_block, failure_mode, cm_hash, *args)
|
6
9
|
@path = path
|
7
10
|
@output_dir = output_dir
|
8
11
|
@ignore_includes = ignore_includes
|
@@ -12,6 +15,12 @@ class DirCrawl
|
|
12
15
|
@failure_mode = failure_mode
|
13
16
|
@output = Array.new
|
14
17
|
@save = save
|
18
|
+
|
19
|
+
# Handle crawler manager info
|
20
|
+
@cm_url = cm_hash[:crawler_manager_url] if cm_hash
|
21
|
+
@selector_id = cm_hash[:selector_id] if cm_hash
|
22
|
+
|
23
|
+
# Crawl
|
15
24
|
crawl_dir(path, *args)
|
16
25
|
end
|
17
26
|
|
@@ -36,12 +45,13 @@ class DirCrawl
|
|
36
45
|
def crawl_dir(dir, *args)
|
37
46
|
Dir.foreach(dir) do |file|
|
38
47
|
next if file == '.' or file == '..'
|
48
|
+
|
39
49
|
# Go to next dir
|
40
50
|
if File.directory?(dir+"/"+file)
|
41
51
|
crawl_dir(dir+"/"+file, *args)
|
42
52
|
|
43
53
|
# Process file
|
44
|
-
elsif !file.include?(@ignore_includes)
|
54
|
+
elsif !file.include?(@ignore_includes)
|
45
55
|
|
46
56
|
# Create Dirs
|
47
57
|
create_write_dirs(dir.gsub(@path, @output_dir))
|
@@ -53,7 +63,11 @@ class DirCrawl
|
|
53
63
|
end
|
54
64
|
|
55
65
|
# Process Main
|
56
|
-
|
66
|
+
if !File.exist?(get_write_dir(dir, file))
|
67
|
+
processed = @process_block.call(dir+"/"+file, *args)
|
68
|
+
else
|
69
|
+
processed = File.read(get_write_dir(dir, file))
|
70
|
+
end
|
57
71
|
|
58
72
|
rescue Exception => e # really catch any failures
|
59
73
|
if @failure_mode == "debug"
|
@@ -64,9 +78,7 @@ class DirCrawl
|
|
64
78
|
end
|
65
79
|
|
66
80
|
# Only save in output if specified (to handle large dirs)
|
67
|
-
|
68
|
-
@output.push(JSON.parse(processed))
|
69
|
-
end
|
81
|
+
report_results([JSON.parse(processed)], dir+"/"+file)
|
70
82
|
|
71
83
|
# Write to file
|
72
84
|
File.write(get_write_dir(dir, file), processed)
|
@@ -74,6 +86,31 @@ class DirCrawl
|
|
74
86
|
end
|
75
87
|
end
|
76
88
|
|
89
|
+
# Figure out how to report results
|
90
|
+
def report_results(results, path)
|
91
|
+
if @cm_url
|
92
|
+
report_incremental(results, path)
|
93
|
+
else
|
94
|
+
report_batch(results)
|
95
|
+
end
|
96
|
+
end
|
97
|
+
|
98
|
+
# Report all results in one JSON
|
99
|
+
def report_batch(results)
|
100
|
+
results.each do |result|
|
101
|
+
@output.push(result)
|
102
|
+
end
|
103
|
+
end
|
104
|
+
|
105
|
+
# Report results back to Harvester incrementally
|
106
|
+
def report_incremental(results, path)
|
107
|
+
curl_url = @cm_url+"/relay_results"
|
108
|
+
c = Curl::Easy.http_post(curl_url,
|
109
|
+
Curl::PostField.content('selector_id', @selector_id),
|
110
|
+
Curl::PostField.content('status_message', "Processed " + path),
|
111
|
+
Curl::PostField.content('results', JSON.pretty_generate(results)))
|
112
|
+
end
|
113
|
+
|
77
114
|
# Get the output array
|
78
115
|
def get_output
|
79
116
|
return JSON.pretty_generate(@output)
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: dircrawl
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.9
|
4
|
+
version: 0.0.10
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- M. C. McGrath
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2016-
|
12
|
+
date: 2016-10-11 00:00:00.000000000 Z
|
13
13
|
dependencies: []
|
14
14
|
description: Run block on all files in dir
|
15
15
|
email: shidash@shidash.com
|
@@ -38,8 +38,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
38
38
|
version: '0'
|
39
39
|
requirements: []
|
40
40
|
rubyforge_project:
|
41
|
-
rubygems_version: 2.
|
41
|
+
rubygems_version: 2.4.8
|
42
42
|
signing_key:
|
43
43
|
specification_version: 4
|
44
44
|
summary: Run block on all files in dir
|
45
45
|
test_files: []
|
46
|
+
has_rdoc:
|