dircrawl 0.0.9 → 0.0.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3)
  1. checksums.yaml +4 -4
  2. data/lib/dircrawl.rb +43 -6
  3. metadata +4 -3
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
- metadata.gz: 4b26892e32bab240fc3d19dbb81ea852705b0e02
- data.tar.gz: 07e70298e2ed813d8fcf28e9f84a61f11958734d
+ metadata.gz: ba3c1a9f1c59a90f094781a6becc3f221e9cfbe4
+ data.tar.gz: 4bbbac56d9ee80eaf8e040b8133ae56f25d9da57
  SHA512:
- metadata.gz: 51a3f508c3481d326c57cb1de331d91273ec5de8e3190074903e8bc9a19b3ffc8238b7c42ceb4eae2714bb789bb99f244464cb0c60b25a4bbc557a69a159299b
- data.tar.gz: 955daa23ad7dda13e86fdb32812c19adbe26614615265c89a921fa3dc49653d38fdbe623391e832fcd769d9138997e30ed6a411b856ea9a7faa01e75ecc0c9b8
+ metadata.gz: e2af2bdcdec751ab8c35bacaebbd8f6bad4adf400c933e1a68828fd9312fbbd88923e6947a4c0fed377c3e4879c8e74a04c482286f45c18d37ed0022395ed73e
+ data.tar.gz: 61c5942ba31099659c5874da9850ae6f15441e1b5764b46cdee69e3329a4e57a4ee1b78c4bbb5130925d728e4178e0543f2d651b0da51d3742725bea0c3f8347
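
Both digest sets cover the metadata.gz and data.tar.gz members of the packaged .gem archive. A minimal sketch of recomputing them with Ruby's standard Digest library, assuming the .gem (a tar archive) has already been unpacked into the current directory:

  require 'digest'

  # Recompute the digests recorded in checksums.yaml. The file names
  # match the keys above; unpacking the .gem itself is not shown.
  %w[metadata.gz data.tar.gz].each do |name|
    bytes = File.binread(name)
    puts "#{name} SHA1:   #{Digest::SHA1.hexdigest(bytes)}"
    puts "#{name} SHA512: #{Digest::SHA512.hexdigest(bytes)}"
  end
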
data/lib/dircrawl.rb CHANGED
@@ -1,8 +1,11 @@
  require 'json'
  require 'pry'
+ require 'curb'
+ require 'selenium-webdriver'
+ require 'uri'

  class DirCrawl
- def initialize(path, output_dir, ignore_includes, save, process_block, include_block, extras_block, failure_mode, *args)
+ def initialize(path, output_dir, ignore_includes, save, process_block, include_block, extras_block, failure_mode, cm_hash, *args)
  @path = path
  @output_dir = output_dir
  @ignore_includes = ignore_includes
@@ -12,6 +15,12 @@ class DirCrawl
  @failure_mode = failure_mode
  @output = Array.new
  @save = save
+
+ # Handle crawler manager info
+ @cm_url = cm_hash[:crawler_manager_url] if cm_hash
+ @selector_id = cm_hash[:selector_id] if cm_hash
+
+ # Crawl
  crawl_dir(path, *args)
  end

@@ -36,12 +45,13 @@ class DirCrawl
  def crawl_dir(dir, *args)
  Dir.foreach(dir) do |file|
  next if file == '.' or file == '..'
+
  # Go to next dir
  if File.directory?(dir+"/"+file)
  crawl_dir(dir+"/"+file, *args)

  # Process file
- elsif !file.include?(@ignore_includes) && !File.exist?(get_write_dir(dir, file))
+ elsif !file.include?(@ignore_includes)

  # Create Dirs
  create_write_dirs(dir.gsub(@path, @output_dir))
@@ -53,7 +63,11 @@ class DirCrawl
  end

  # Process Main
- processed = @process_block.call(dir+"/"+file, *args)
+ if !File.exist?(get_write_dir(dir, file))
+ processed = @process_block.call(dir+"/"+file, *args)
+ else
+ processed = File.read(get_write_dir(dir, file))
+ end

  rescue Exception => e # really catch any failures
  if @failure_mode == "debug"
@@ -64,9 +78,7 @@ class DirCrawl
  end

  # Only save in output if specified (to handle large dirs)
- if @save
- @output.push(JSON.parse(processed))
- end
+ report_results([JSON.parse(processed)], dir+"/"+file)

  # Write to file
  File.write(get_write_dir(dir, file), processed)
@@ -74,6 +86,31 @@ class DirCrawl
  end
  end

+ # Figure out how to report results
+ def report_results(results, path)
+ if @cm_url
+ report_incremental(results, path)
+ else
+ report_batch(results)
+ end
+ end
+
+ # Report all results in one JSON
+ def report_batch(results)
+ results.each do |result|
+ @output.push(result)
+ end
+ end
+
+ # Report results back to Harvester incrementally
+ def report_incremental(results, path)
+ curl_url = @cm_url+"/relay_results"
+ c = Curl::Easy.http_post(curl_url,
+ Curl::PostField.content('selector_id', @selector_id),
+ Curl::PostField.content('status_message', "Processed " + path),
+ Curl::PostField.content('results', JSON.pretty_generate(results)))
+ end
+
  # Get the output array
  def get_output
  return JSON.pretty_generate(@output)
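
The net effect of these hunks: the constructor gains a cm_hash argument (pass nil to keep the old behavior), already-processed files are re-read from the output directory instead of skipped, and results flow through report_results, which POSTs each file's parsed output to <crawler_manager_url>/relay_results when a crawler manager is configured and otherwise buffers it in @output as before. A minimal caller sketch; the paths, URL, selector id, and blocks below are hypothetical placeholders, not part of the gem:

  require 'json'
  require 'dircrawl'

  # Hypothetical blocks. process_block must return a JSON string,
  # because crawl_dir calls JSON.parse on its result.
  process_block = lambda { |file| JSON.generate({"file" => file}) }
  include_block = lambda { |file| "" }
  extras_block  = lambda { |file| "" }

  # With cm_hash set, each parsed result is POSTed to
  # <crawler_manager_url>/relay_results; with cm_hash = nil,
  # results accumulate in @output and come back via get_output.
  cm_hash = {
    crawler_manager_url: "http://localhost:3000", # hypothetical
    selector_id: "example_selector"               # hypothetical
  }

  crawler = DirCrawl.new("/data/in", "/data/out", ".DS_Store", true,
                         process_block, include_block, extras_block,
                         "debug", cm_hash)
  puts crawler.get_output

Since report_incremental sends plain form fields via curb's Curl::Easy.http_post, a crawler manager only needs to expose a /relay_results endpoint that accepts selector_id, status_message, and results.
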
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: dircrawl
  version: !ruby/object:Gem::Version
- version: 0.0.9
+ version: 0.0.10
  platform: ruby
  authors:
  - M. C. McGrath
@@ -9,7 +9,7 @@ authors:
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2016-08-15 00:00:00.000000000 Z
+ date: 2016-10-11 00:00:00.000000000 Z
  dependencies: []
  description: Run block on all files in dir
  email: shidash@shidash.com
@@ -38,8 +38,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
  version: '0'
  requirements: []
  rubyforge_project:
- rubygems_version: 2.5.1
+ rubygems_version: 2.4.8
  signing_key:
  specification_version: 4
  summary: Run block on all files in dir
  test_files: []
+ has_rdoc: