dircrawl 0.0.9 → 0.0.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/dircrawl.rb +43 -6
- metadata +4 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: ba3c1a9f1c59a90f094781a6becc3f221e9cfbe4
+  data.tar.gz: 4bbbac56d9ee80eaf8e040b8133ae56f25d9da57
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: e2af2bdcdec751ab8c35bacaebbd8f6bad4adf400c933e1a68828fd9312fbbd88923e6947a4c0fed377c3e4879c8e74a04c482286f45c18d37ed0022395ed73e
+  data.tar.gz: 61c5942ba31099659c5874da9850ae6f15441e1b5764b46cdee69e3329a4e57a4ee1b78c4bbb5130925d728e4178e0543f2d651b0da51d3742725bea0c3f8347
data/lib/dircrawl.rb
CHANGED
@@ -1,8 +1,11 @@
 require 'json'
 require 'pry'
+require 'curb'
+require 'selenium-webdriver'
+require 'uri'
 
 class DirCrawl
-  def initialize(path, output_dir, ignore_includes, save, process_block, include_block, extras_block, failure_mode, *args)
+  def initialize(path, output_dir, ignore_includes, save, process_block, include_block, extras_block, failure_mode, cm_hash, *args)
     @path = path
     @output_dir = output_dir
     @ignore_includes = ignore_includes
@@ -12,6 +15,12 @@ class DirCrawl
     @failure_mode = failure_mode
     @output = Array.new
     @save = save
+
+    # Handle crawler manager info
+    @cm_url = cm_hash[:crawler_manager_url] if cm_hash
+    @selector_id = cm_hash[:selector_id] if cm_hash
+
+    # Crawl
     crawl_dir(path, *args)
   end
 
@@ -36,12 +45,13 @@ class DirCrawl
   def crawl_dir(dir, *args)
     Dir.foreach(dir) do |file|
      next if file == '.' or file == '..'
+
       # Go to next dir
       if File.directory?(dir+"/"+file)
         crawl_dir(dir+"/"+file, *args)
 
       # Process file
-      elsif !file.include?(@ignore_includes)
+      elsif !file.include?(@ignore_includes)
 
         # Create Dirs
         create_write_dirs(dir.gsub(@path, @output_dir))
@@ -53,7 +63,11 @@ class DirCrawl
         end
 
         # Process Main
-
+        if !File.exist?(get_write_dir(dir, file))
+          processed = @process_block.call(dir+"/"+file, *args)
+        else
+          processed = File.read(get_write_dir(dir, file))
+        end
 
       rescue Exception => e # really catch any failures
         if @failure_mode == "debug"
@@ -64,9 +78,7 @@ class DirCrawl
         end
 
         # Only save in output if specified (to handle large dirs)
-
-          @output.push(JSON.parse(processed))
-        end
+        report_results([JSON.parse(processed)], dir+"/"+file)
 
         # Write to file
         File.write(get_write_dir(dir, file), processed)
@@ -74,6 +86,31 @@ class DirCrawl
     end
   end
 
+  # Figure out how to report results
+  def report_results(results, path)
+    if @cm_url
+      report_incremental(results, path)
+    else
+      report_batch(results)
+    end
+  end
+
+  # Report all results in one JSON
+  def report_batch(results)
+    results.each do |result|
+      @output.push(result)
+    end
+  end
+
+  # Report results back to Harvester incrementally
+  def report_incremental(results, path)
+    curl_url = @cm_url+"/relay_results"
+    c = Curl::Easy.http_post(curl_url,
+                             Curl::PostField.content('selector_id', @selector_id),
+                             Curl::PostField.content('status_message', "Processed " + path),
+                             Curl::PostField.content('results', JSON.pretty_generate(results)))
+  end
+
   # Get the output array
   def get_output
     return JSON.pretty_generate(@output)
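In short, 0.0.10 inserts a cm_hash argument before the splat, and per-file results are now routed through report_results: if cm_hash supplies a :crawler_manager_url they are POSTed to that URL plus "/relay_results" via curb, otherwise they accumulate in @output as before. The sketch below shows how a caller might pass the new argument; it is only an illustration of the changed signature under assumed values, not documentation from the gem. The directory paths, the process block, the block and failure_mode placeholders, and the crawler manager URL and selector id are all hypothetical.

require 'dircrawl'
require 'json'

# Hypothetical process block: crawl_dir calls JSON.parse on whatever the block
# returns, so it should return a JSON string.
process_block = lambda do |filepath, *args|
  JSON.generate("file" => filepath, "size" => File.size(filepath))
end

# New in 0.0.10: optional crawler manager info. Passing nil instead keeps the
# old batch behavior, with results collected in @output and read via get_output.
cm_hash = {
  :crawler_manager_url => "http://localhost:3000",  # assumed endpoint, illustration only
  :selector_id => "example_selector"                # assumed id, illustration only
}

# Constructing the object immediately runs crawl_dir, as in 0.0.9.
crawler = DirCrawl.new("/data/input", "/data/output", ".keep", true,
                       process_block, nil, nil, "debug", cm_hash)

# Batch mode only (cm_hash nil): get_output returns the collected results as JSON.
puts crawler.get_output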
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: dircrawl
 version: !ruby/object:Gem::Version
-  version: 0.0.9
+  version: 0.0.10
 platform: ruby
 authors:
 - M. C. McGrath
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2016-
+date: 2016-10-11 00:00:00.000000000 Z
 dependencies: []
 description: Run block on all files in dir
 email: shidash@shidash.com
@@ -38,8 +38,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
       version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 2.
+rubygems_version: 2.4.8
 signing_key:
 specification_version: 4
 summary: Run block on all files in dir
 test_files: []
+has_rdoc: