dircrawl 0.0.14 → 0.0.15

Sign up to get free protection for your applications and to get access to all the features.
Files changed (3) hide show
  1. checksums.yaml +4 -4
  2. data/lib/dircrawl.rb +55 -100
  3. metadata +3 -4
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 99a04d217d2ae8f295d5e7b71d702019128fa888
4
- data.tar.gz: 27c211fce81a7649ac03c77158c25889b42a47ad
3
+ metadata.gz: 042b2465ba3729dd8b32b61a6fa55a9732ce6bb0
4
+ data.tar.gz: 3c8bb1ef5cf0fffed384f91fdda387f41185ed97
5
5
  SHA512:
6
- metadata.gz: 092d8f4d0f1456bc83c9176ad82853f03b6272414881e946e12ed85730831434600ab1a59efe52ac4c55dcf2096ffb392ebf1ebc3ddb3583be01a3c55f2b23cf
7
- data.tar.gz: 661c0f676837b758e827a34060ce599dfe62f6bd211d206a0caca87078c4ba378009ec6c47253d30df0cad9f7a8e35feff620a0f4ed8471fca132d7f354dbd36
6
+ metadata.gz: 29b0dd91babcdf8600b2ac3c7600749986018e418706f539574c274dca51df7ca8e649dcd861dda1869ac2cde206d2fd1b45a4643d260055ad698bf8711efc9e
7
+ data.tar.gz: 4bda8db830eedc8f15e11f104874881034ddedc67434d5e86736cdd43ef5ea013cd9a21ff2566c7a857e8409265f3df25366e7f865479999a1090d79dbaef7fa
data/lib/dircrawl.rb CHANGED
@@ -1,47 +1,27 @@
1
1
  require 'json'
2
2
  require 'pry'
3
- require 'curb'
4
- require 'selenium-webdriver'
5
- require 'uri'
3
+ require 'harvesterreporter'
6
4
 
5
+ # Crawls a directory of files and runs a block of code on it
7
6
  class DirCrawl
8
- def initialize(path, output_dir, ignore_includes, save, process_block, include_block, extras_block, failure_mode, cm_hash, *args)
9
- @path = path
10
- @output_dir = output_dir
11
- @ignore_includes = ignore_includes
7
+ def initialize(path_params, process_block, include_block, extras_block, cm_hash, *args)
8
+ # Set the params for the path
9
+ @path = path_params[:path]
10
+ @output_dir = path_params[:output_dir]
11
+ @ignore_includes = path_params[:ignore_includes]
12
+ @failure_mode = path_params[:failure_mode]
13
+
14
+ # Setup the blocks to run
12
15
  include_block.call
13
16
  @process_block = process_block
14
17
  @extras_block = extras_block
15
- @failure_mode = failure_mode
16
- @output = Array.new
17
- @save = save
18
18
 
19
- # Handle crawler manager info
20
- @cm_url = cm_hash[:crawler_manager_url] if cm_hash
21
- @selector_id = cm_hash[:selector_id] if cm_hash
22
-
23
- # Crawl
24
- crawl_dir(path, *args)
19
+ # Setup the Harvester reporter to report the results
20
+ @reporter = HarvesterReporter.new(cm_hash)
21
+ crawl_dir(@path, *args)
25
22
  end
26
23
 
27
- # Figure out where to write it
28
- def get_write_dir(dir, file)
29
- dir_save = dir.gsub(@path, @output_dir)
30
- return dir_save+"/"+file+".json"
31
- end
32
-
33
- # Create if they don't exist
34
- def create_write_dirs(dir)
35
- dirs = dir.split("/")
36
- dirs.delete("")
37
- overallpath = ""
38
- dirs.each do |d|
39
- Dir.mkdir(overallpath+"/"+d) if !File.directory?(overallpath+"/"+d)
40
- overallpath += ("/"+d)
41
- end
42
- end
43
-
44
- # Crawl dir and call block for each file
24
+ # Crawls the directory specified
45
25
  def crawl_dir(dir, *args)
46
26
  Dir.foreach(dir) do |file|
47
27
  # Skip . or .. files
@@ -49,90 +29,65 @@ class DirCrawl
49
29
 
50
30
  # Recurse into directories
51
31
  if File.directory?(dir+"/"+file)
52
- report_status("Going to next directory: " + dir+"/"+file)
53
- crawl_dir(dir+"/"+file, *args)
32
+ crawl_dir("#{dir}/#{file}", *args)
54
33
 
55
34
  # Process file
56
35
  elsif !file.include?(@ignore_includes)
57
-
58
- # Create Output Directory
59
- create_write_dirs(dir.gsub(@path, @output_dir))
60
-
61
36
  begin
62
-
63
- # Check if processed file exists
64
- # Skip processing (if yes)
65
- if !File.exist?(get_write_dir(dir, file))
66
-
67
- # Process Extras
68
- if @extras_block != ""
69
- extras = @extras_block.call(@output_dir+"/")
70
- end
71
-
72
- # Now Process Main
73
- processed = @process_block.call(dir+"/"+file, *args)
74
- else
75
- puts "Processed file exists, skipping"
76
- puts " " + dir + "/" + file
77
- processed = File.read(get_write_dir(dir, file))
78
- end
79
-
80
- rescue Exception => e # really catch any failures
81
- report_status("Error on file "+file+": "+e.to_s)
82
- if @failure_mode == "debug"
83
- binding.pry
84
- elsif @failure_mode == "log"
85
- error_file = dir + "/" + file + "\n"
86
- IO.write(@output_dir+"/error_log.txt", error_file, mode: 'a')
87
- end
37
+ output_results(process_file(dir, file, *args), dir, file)
38
+ rescue Exception => e
39
+ handle_failure(e, dir, file, *args)
88
40
  end
89
-
90
- # Only save in output if specified (to handle large dirs)
91
- report_results([JSON.parse(processed)], dir+"/"+file)
92
-
93
- # Write Output to file
94
- File.write(get_write_dir(dir, file), processed)
95
41
  end
96
42
  end
97
43
  end
98
44
 
99
- # Figure out how to report results
100
- def report_results(results, path)
101
- if @cm_url
102
- report_incremental(results, path)
103
- else
104
- report_batch(results)
45
+ # Process a file using the blocks given
46
+ def process_file(dir, file, *args)
47
+ create_write_dirs(dir.gsub(@path, @output_dir))
48
+
49
+ # Run blocks to process the file
50
+ if !File.exist?(get_write_path(dir, file))
51
+ @extras_block.call("#{@output_dir}/") if !@extras_block.empty?
52
+ return @process_block.call("#{dir}/#{file}", *args)
53
+ else # Use already existing file
54
+ puts "Processed file exists, skipping: #{dir}/#{file}"
55
+ return File.read(get_write_path(dir, file))
105
56
  end
106
57
  end
107
58
 
108
- # Report all results in one JSON
109
- def report_batch(results)
110
- results.each do |result|
111
- @output.push(result)
112
- end
59
+ # Output the results to Harvester and file dir
60
+ def output_results(processed, dir, file)
61
+ @reporter.report_results([JSON.parse(processed)], "#{dir}/#{file}")
62
+ File.write(get_write_path(dir, file), processed)
113
63
  end
114
64
 
115
- # Report Harvester status message
116
- def report_status(status_msg)
117
- if @cm_url
118
- curl_url = @cm_url+"/update_status"
119
- c = Curl::Easy.http_post(curl_url,
120
- Curl::PostField.content('selector_id', @selector_id),
121
- Curl::PostField.content('status_message', status_msg))
65
+ # Create the output directories if they don't exist
66
+ def create_write_dirs(dir)
67
+ dirs = dir.split("/")
68
+ dirs.delete("")
69
+
70
+ # Go through and create all subdirs
71
+ overallpath = ""
72
+ dirs.each do |d|
73
+ Dir.mkdir(overallpath+"/"+d) if !File.directory?(overallpath+"/"+d)
74
+ overallpath += ("/"+d)
122
75
  end
123
76
  end
124
77
 
125
- # Report results back to Harvester incrementally
126
- def report_incremental(results, path)
127
- curl_url = @cm_url+"/relay_results"
128
- c = Curl::Easy.http_post(curl_url,
129
- Curl::PostField.content('selector_id', @selector_id),
130
- Curl::PostField.content('status_message', "Processed " + path),
131
- Curl::PostField.content('results', JSON.pretty_generate(results)))
78
+ # Figure out where to write the file
79
+ def get_write_path(dir, file)
80
+ dir_save = dir.gsub(@path, @output_dir)
81
+ return "#{dir_save}/#{file}.json"
132
82
  end
133
83
 
134
- # Get the output array
135
- def get_output
136
- return JSON.pretty_generate(@output)
84
+ # Handle different failure modes
85
+ def handle_failure(error, dir, file, *args)
86
+ if @failure_mode == "debug"
87
+ binding.pry
88
+ elsif @failure_mode == "log"
89
+ error_file = "#{dir}/#{file}\n"
90
+ IO.write(@output_dir+"/error_log.txt", error_file, mode: 'a')
91
+ end
137
92
  end
138
93
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: dircrawl
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.14
4
+ version: 0.0.15
5
5
  platform: ruby
6
6
  authors:
7
7
  - M. C. McGrath
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2017-01-19 00:00:00.000000000 Z
12
+ date: 2017-05-22 00:00:00.000000000 Z
13
13
  dependencies: []
14
14
  description: Run block on all files in dir
15
15
  email: shidash@shidash.com
@@ -38,9 +38,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
38
38
  version: '0'
39
39
  requirements: []
40
40
  rubyforge_project:
41
- rubygems_version: 2.4.8
41
+ rubygems_version: 2.6.11
42
42
  signing_key:
43
43
  specification_version: 4
44
44
  summary: Run block on all files in dir
45
45
  test_files: []
46
- has_rdoc: