dircrawl 0.0.9 → 0.0.10

Sign up to get free protection for your applications and to get access to all the features.
Files changed (3)
  1. checksums.yaml +4 -4
  2. data/lib/dircrawl.rb +43 -6
  3. metadata +4 -3
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 4b26892e32bab240fc3d19dbb81ea852705b0e02
4
- data.tar.gz: 07e70298e2ed813d8fcf28e9f84a61f11958734d
3
+ metadata.gz: ba3c1a9f1c59a90f094781a6becc3f221e9cfbe4
4
+ data.tar.gz: 4bbbac56d9ee80eaf8e040b8133ae56f25d9da57
5
5
  SHA512:
6
- metadata.gz: 51a3f508c3481d326c57cb1de331d91273ec5de8e3190074903e8bc9a19b3ffc8238b7c42ceb4eae2714bb789bb99f244464cb0c60b25a4bbc557a69a159299b
7
- data.tar.gz: 955daa23ad7dda13e86fdb32812c19adbe26614615265c89a921fa3dc49653d38fdbe623391e832fcd769d9138997e30ed6a411b856ea9a7faa01e75ecc0c9b8
6
+ metadata.gz: e2af2bdcdec751ab8c35bacaebbd8f6bad4adf400c933e1a68828fd9312fbbd88923e6947a4c0fed377c3e4879c8e74a04c482286f45c18d37ed0022395ed73e
7
+ data.tar.gz: 61c5942ba31099659c5874da9850ae6f15441e1b5764b46cdee69e3329a4e57a4ee1b78c4bbb5130925d728e4178e0543f2d651b0da51d3742725bea0c3f8347
data/lib/dircrawl.rb CHANGED
@@ -1,8 +1,11 @@
1
1
  require 'json'
2
2
  require 'pry'
3
+ require 'curb'
4
+ require 'selenium-webdriver'
5
+ require 'uri'
3
6
 
4
7
  class DirCrawl
5
- def initialize(path, output_dir, ignore_includes, save, process_block, include_block, extras_block, failure_mode, *args)
8
+ def initialize(path, output_dir, ignore_includes, save, process_block, include_block, extras_block, failure_mode, cm_hash, *args)
6
9
  @path = path
7
10
  @output_dir = output_dir
8
11
  @ignore_includes = ignore_includes
@@ -12,6 +15,12 @@ class DirCrawl
12
15
  @failure_mode = failure_mode
13
16
  @output = Array.new
14
17
  @save = save
18
+
19
+ # Handle crawler manager info
20
+ @cm_url = cm_hash[:crawler_manager_url] if cm_hash
21
+ @selector_id = cm_hash[:selector_id] if cm_hash
22
+
23
+ # Crawl
15
24
  crawl_dir(path, *args)
16
25
  end
17
26
 
@@ -36,12 +45,13 @@ class DirCrawl
36
45
  def crawl_dir(dir, *args)
37
46
  Dir.foreach(dir) do |file|
38
47
  next if file == '.' or file == '..'
48
+
39
49
  # Go to next dir
40
50
  if File.directory?(dir+"/"+file)
41
51
  crawl_dir(dir+"/"+file, *args)
42
52
 
43
53
  # Process file
44
- elsif !file.include?(@ignore_includes) && !File.exist?(get_write_dir(dir, file))
54
+ elsif !file.include?(@ignore_includes)
45
55
 
46
56
  # Create Dirs
47
57
  create_write_dirs(dir.gsub(@path, @output_dir))
@@ -53,7 +63,11 @@ class DirCrawl
53
63
  end
54
64
 
55
65
  # Process Main
56
- processed = @process_block.call(dir+"/"+file, *args)
66
+ if !File.exist?(get_write_dir(dir, file))
67
+ processed = @process_block.call(dir+"/"+file, *args)
68
+ else
69
+ processed = File.read(get_write_dir(dir, file))
70
+ end
57
71
 
58
72
  rescue Exception => e # really catch any failures
59
73
  if @failure_mode == "debug"
@@ -64,9 +78,7 @@ class DirCrawl
64
78
  end
65
79
 
66
80
  # Only save in output if specified (to handle large dirs)
67
- if @save
68
- @output.push(JSON.parse(processed))
69
- end
81
+ report_results([JSON.parse(processed)], dir+"/"+file)
70
82
 
71
83
  # Write to file
72
84
  File.write(get_write_dir(dir, file), processed)
@@ -74,6 +86,31 @@ class DirCrawl
74
86
  end
75
87
  end
76
88
 
89
+ # Figure out how to report results
90
+ def report_results(results, path)
91
+ if @cm_url
92
+ report_incremental(results, path)
93
+ else
94
+ report_batch(results)
95
+ end
96
+ end
97
+
98
+ # Report all results in one JSON
99
+ def report_batch(results)
100
+ results.each do |result|
101
+ @output.push(result)
102
+ end
103
+ end
104
+
105
+ # Report results back to Harvester incrementally
106
+ def report_incremental(results, path)
107
+ curl_url = @cm_url+"/relay_results"
108
+ c = Curl::Easy.http_post(curl_url,
109
+ Curl::PostField.content('selector_id', @selector_id),
110
+ Curl::PostField.content('status_message', "Processed " + path),
111
+ Curl::PostField.content('results', JSON.pretty_generate(results)))
112
+ end
113
+
77
114
  # Get the output array
78
115
  def get_output
79
116
  return JSON.pretty_generate(@output)
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: dircrawl
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.9
4
+ version: 0.0.10
5
5
  platform: ruby
6
6
  authors:
7
7
  - M. C. McGrath
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2016-08-15 00:00:00.000000000 Z
12
+ date: 2016-10-11 00:00:00.000000000 Z
13
13
  dependencies: []
14
14
  description: Run block on all files in dir
15
15
  email: shidash@shidash.com
@@ -38,8 +38,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
38
38
  version: '0'
39
39
  requirements: []
40
40
  rubyforge_project:
41
- rubygems_version: 2.5.1
41
+ rubygems_version: 2.4.8
42
42
  signing_key:
43
43
  specification_version: 4
44
44
  summary: Run block on all files in dir
45
45
  test_files: []
46
+ has_rdoc: