dircrawl 0.0.14 → 0.0.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3) hide show
  1. checksums.yaml +4 -4
  2. data/lib/dircrawl.rb +55 -100
  3. metadata +3 -4
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 99a04d217d2ae8f295d5e7b71d702019128fa888
4
- data.tar.gz: 27c211fce81a7649ac03c77158c25889b42a47ad
3
+ metadata.gz: 042b2465ba3729dd8b32b61a6fa55a9732ce6bb0
4
+ data.tar.gz: 3c8bb1ef5cf0fffed384f91fdda387f41185ed97
5
5
  SHA512:
6
- metadata.gz: 092d8f4d0f1456bc83c9176ad82853f03b6272414881e946e12ed85730831434600ab1a59efe52ac4c55dcf2096ffb392ebf1ebc3ddb3583be01a3c55f2b23cf
7
- data.tar.gz: 661c0f676837b758e827a34060ce599dfe62f6bd211d206a0caca87078c4ba378009ec6c47253d30df0cad9f7a8e35feff620a0f4ed8471fca132d7f354dbd36
6
+ metadata.gz: 29b0dd91babcdf8600b2ac3c7600749986018e418706f539574c274dca51df7ca8e649dcd861dda1869ac2cde206d2fd1b45a4643d260055ad698bf8711efc9e
7
+ data.tar.gz: 4bda8db830eedc8f15e11f104874881034ddedc67434d5e86736cdd43ef5ea013cd9a21ff2566c7a857e8409265f3df25366e7f865479999a1090d79dbaef7fa
data/lib/dircrawl.rb CHANGED
@@ -1,47 +1,27 @@
1
1
  require 'json'
2
2
  require 'pry'
3
- require 'curb'
4
- require 'selenium-webdriver'
5
- require 'uri'
3
+ require 'harvesterreporter'
6
4
 
5
+ # Crawls a directory of files and runs a block of code on it
7
6
  class DirCrawl
8
- def initialize(path, output_dir, ignore_includes, save, process_block, include_block, extras_block, failure_mode, cm_hash, *args)
9
- @path = path
10
- @output_dir = output_dir
11
- @ignore_includes = ignore_includes
7
+ def initialize(path_params, process_block, include_block, extras_block, cm_hash, *args)
8
+ # Set the params for the path
9
+ @path = path_params[:path]
10
+ @output_dir = path_params[:output_dir]
11
+ @ignore_includes = path_params[:ignore_includes]
12
+ @failure_mode = path_params[:failure_mode]
13
+
14
+ # Setup the blocks to run
12
15
  include_block.call
13
16
  @process_block = process_block
14
17
  @extras_block = extras_block
15
- @failure_mode = failure_mode
16
- @output = Array.new
17
- @save = save
18
18
 
19
- # Handle crawler manager info
20
- @cm_url = cm_hash[:crawler_manager_url] if cm_hash
21
- @selector_id = cm_hash[:selector_id] if cm_hash
22
-
23
- # Crawl
24
- crawl_dir(path, *args)
19
+ # Setup the Harvester reporter to report the results
20
+ @reporter = HarvesterReporter.new(cm_hash)
21
+ crawl_dir(@path, *args)
25
22
  end
26
23
 
27
- # Figure out where to write it
28
- def get_write_dir(dir, file)
29
- dir_save = dir.gsub(@path, @output_dir)
30
- return dir_save+"/"+file+".json"
31
- end
32
-
33
- # Create if they don't exist
34
- def create_write_dirs(dir)
35
- dirs = dir.split("/")
36
- dirs.delete("")
37
- overallpath = ""
38
- dirs.each do |d|
39
- Dir.mkdir(overallpath+"/"+d) if !File.directory?(overallpath+"/"+d)
40
- overallpath += ("/"+d)
41
- end
42
- end
43
-
44
- # Crawl dir and call block for each file
24
+ # Crawls the directory specified
45
25
  def crawl_dir(dir, *args)
46
26
  Dir.foreach(dir) do |file|
47
27
  # Skip . or .. files
@@ -49,90 +29,65 @@ class DirCrawl
49
29
 
50
30
  # Recurse into directories
51
31
  if File.directory?(dir+"/"+file)
52
- report_status("Going to next directory: " + dir+"/"+file)
53
- crawl_dir(dir+"/"+file, *args)
32
+ crawl_dir("#{dir}/#{file}", *args)
54
33
 
55
34
  # Process file
56
35
  elsif !file.include?(@ignore_includes)
57
-
58
- # Create Output Directory
59
- create_write_dirs(dir.gsub(@path, @output_dir))
60
-
61
36
  begin
62
-
63
- # Check if processed file exists
64
- # Skip processing (if yes)
65
- if !File.exist?(get_write_dir(dir, file))
66
-
67
- # Process Extras
68
- if @extras_block != ""
69
- extras = @extras_block.call(@output_dir+"/")
70
- end
71
-
72
- # Now Process Main
73
- processed = @process_block.call(dir+"/"+file, *args)
74
- else
75
- puts "Processed file exists, skipping"
76
- puts " " + dir + "/" + file
77
- processed = File.read(get_write_dir(dir, file))
78
- end
79
-
80
- rescue Exception => e # really catch any failures
81
- report_status("Error on file "+file+": "+e.to_s)
82
- if @failure_mode == "debug"
83
- binding.pry
84
- elsif @failure_mode == "log"
85
- error_file = dir + "/" + file + "\n"
86
- IO.write(@output_dir+"/error_log.txt", error_file, mode: 'a')
87
- end
37
+ output_results(process_file(dir, file, *args), dir, file)
38
+ rescue Exception => e
39
+ handle_failure(e, dir, file, *args)
88
40
  end
89
-
90
- # Only save in output if specified (to handle large dirs)
91
- report_results([JSON.parse(processed)], dir+"/"+file)
92
-
93
- # Write Output to file
94
- File.write(get_write_dir(dir, file), processed)
95
41
  end
96
42
  end
97
43
  end
98
44
 
99
- # Figure out how to report results
100
- def report_results(results, path)
101
- if @cm_url
102
- report_incremental(results, path)
103
- else
104
- report_batch(results)
45
+ # Process a file using the blocks given
46
+ def process_file(dir, file, *args)
47
+ create_write_dirs(dir.gsub(@path, @output_dir))
48
+
49
+ # Run blocks to process the file
50
+ if !File.exist?(get_write_path(dir, file))
51
+ @extras_block.call("#{@output_dir}/") if !@extras_block.empty?
52
+ return @process_block.call("#{dir}/#{file}", *args)
53
+ else # Use already existing file
54
+ puts "Processed file exists, skipping: #{dir}/#{file}"
55
+ return File.read(get_write_path(dir, file))
105
56
  end
106
57
  end
107
58
 
108
- # Report all results in one JSON
109
- def report_batch(results)
110
- results.each do |result|
111
- @output.push(result)
112
- end
59
+ # Output the results to Harvester and file dir
60
+ def output_results(processed, dir, file)
61
+ @reporter.report_results([JSON.parse(processed)], "#{dir}/#{file}")
62
+ File.write(get_write_path(dir, file), processed)
113
63
  end
114
64
 
115
- # Report Harvester status message
116
- def report_status(status_msg)
117
- if @cm_url
118
- curl_url = @cm_url+"/update_status"
119
- c = Curl::Easy.http_post(curl_url,
120
- Curl::PostField.content('selector_id', @selector_id),
121
- Curl::PostField.content('status_message', status_msg))
65
+ # Create if they don't exist
66
+ def create_write_dirs(dir)
67
+ dirs = dir.split("/")
68
+ dirs.delete("")
69
+
70
+ # Go through and create all subdirs
71
+ overallpath = ""
72
+ dirs.each do |d|
73
+ Dir.mkdir(overallpath+"/"+d) if !File.directory?(overallpath+"/"+d)
74
+ overallpath += ("/"+d)
122
75
  end
123
76
  end
124
77
 
125
- # Report results back to Harvester incrementally
126
- def report_incremental(results, path)
127
- curl_url = @cm_url+"/relay_results"
128
- c = Curl::Easy.http_post(curl_url,
129
- Curl::PostField.content('selector_id', @selector_id),
130
- Curl::PostField.content('status_message', "Processed " + path),
131
- Curl::PostField.content('results', JSON.pretty_generate(results)))
78
+ # Figure out where to write the file
79
+ def get_write_path(dir, file)
80
+ dir_save = dir.gsub(@path, @output_dir)
81
+ return "#{dir_save}/#{file}.json"
132
82
  end
133
83
 
134
- # Get the output array
135
- def get_output
136
- return JSON.pretty_generate(@output)
84
+ # Handle different failure modes
85
+ def handle_failure(error, dir, file, *args)
86
+ if @failure_mode == "debug"
87
+ binding.pry
88
+ elsif @failure_mode == "log"
89
+ error_file = "#{dir}/#{file}\n"
90
+ IO.write(@output_dir+"/error_log.txt", error_file, mode: 'a')
91
+ end
137
92
  end
138
93
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: dircrawl
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.14
4
+ version: 0.0.15
5
5
  platform: ruby
6
6
  authors:
7
7
  - M. C. McGrath
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2017-01-19 00:00:00.000000000 Z
12
+ date: 2017-05-22 00:00:00.000000000 Z
13
13
  dependencies: []
14
14
  description: Run block on all files in dir
15
15
  email: shidash@shidash.com
@@ -38,9 +38,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
38
38
  version: '0'
39
39
  requirements: []
40
40
  rubyforge_project:
41
- rubygems_version: 2.4.8
41
+ rubygems_version: 2.6.11
42
42
  signing_key:
43
43
  specification_version: 4
44
44
  summary: Run block on all files in dir
45
45
  test_files: []
46
- has_rdoc: