dircrawl 0.0.14 → 0.0.15
- checksums.yaml +4 -4
- data/lib/dircrawl.rb +55 -100
- metadata +3 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 042b2465ba3729dd8b32b61a6fa55a9732ce6bb0
+  data.tar.gz: 3c8bb1ef5cf0fffed384f91fdda387f41185ed97
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 29b0dd91babcdf8600b2ac3c7600749986018e418706f539574c274dca51df7ca8e649dcd861dda1869ac2cde206d2fd1b45a4643d260055ad698bf8711efc9e
+  data.tar.gz: 4bda8db830eedc8f15e11f104874881034ddedc67434d5e86736cdd43ef5ea013cd9a21ff2566c7a857e8409265f3df25366e7f865479999a1090d79dbaef7fa
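These published checksums can be re-verified locally: a .gem file is an uncompressed tar archive whose members include metadata.gz and data.tar.gz, and Ruby's standard library can recompute the SHA512 digests. A minimal sketch, assuming the gem was downloaded as dircrawl-0.0.15.gem in the current directory:

require 'digest'
require 'rubygems/package'

# Published SHA512 for data.tar.gz, taken from checksums.yaml above
EXPECTED = "4bda8db830eedc8f15e11f104874881034ddedc67434d5e86736cdd43ef5ea013cd9a21ff2566c7a857e8409265f3df25366e7f865479999a1090d79dbaef7fa"

File.open("dircrawl-0.0.15.gem", "rb") do |io|
  tar = Gem::Package::TarReader.new(io)
  tar.each do |entry|
    next unless entry.full_name == "data.tar.gz"
    # Hash the archived member and compare against the published value
    actual = Digest::SHA512.hexdigest(entry.read)
    puts(actual == EXPECTED ? "data.tar.gz: checksum OK" : "data.tar.gz: checksum MISMATCH")
  end
end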
data/lib/dircrawl.rb
CHANGED
@@ -1,47 +1,27 @@
 require 'json'
 require 'pry'
-require '
-require 'selenium-webdriver'
-require 'uri'
+require 'harvesterreporter'
 
+# Crawls a directory of files and runs a block of code on it
 class DirCrawl
-  def initialize(
-
-    @
-    @
+  def initialize(path_params, process_block, include_block, extras_block, cm_hash, *args)
+    # Set the params for the path
+    @path = path_params[:path]
+    @output_dir = path_params[:output_dir]
+    @ignore_includes = path_params[:ignore_includes]
+    @failure_mode = path_params[:failure_mode]
+
+    # Setup the blocks to run
     include_block.call
     @process_block = process_block
     @extras_block = extras_block
-    @failure_mode = failure_mode
-    @output = Array.new
-    @save = save
 
-    #
-    @
-    @
-
-    # Crawl
-    crawl_dir(path, *args)
+    # Setup the Harvester reporter to report the results
+    @reporter = HarvesterReporter.new(cm_hash)
+    crawl_dir(@path, *args)
   end
 
-  #
-  def get_write_dir(dir, file)
-    dir_save = dir.gsub(@path, @output_dir)
-    return dir_save+"/"+file+".json"
-  end
-
-  # Create if they don't exist
-  def create_write_dirs(dir)
-    dirs = dir.split("/")
-    dirs.delete("")
-    overallpath = ""
-    dirs.each do |d|
-      Dir.mkdir(overallpath+"/"+d) if !File.directory?(overallpath+"/"+d)
-      overallpath += ("/"+d)
-    end
-  end
-
-  # Crawl dir and call block for each file
+  # Crawls the directory specified
   def crawl_dir(dir, *args)
     Dir.foreach(dir) do |file|
       # Skip . or .. files
@@ -49,90 +29,65 @@ class DirCrawl
 
       # Recurse into directories
       if File.directory?(dir+"/"+file)
-
-        crawl_dir(dir+"/"+file, *args)
+        crawl_dir("#{dir}/#{file}", *args)
 
       # Process file
       elsif !file.include?(@ignore_includes)
-
-        # Create Output Directory
-        create_write_dirs(dir.gsub(@path, @output_dir))
-
         begin
-
-
-
-          if !File.exist?(get_write_dir(dir, file))
-
-            # Process Extras
-            if @extras_block != ""
-              extras = @extras_block.call(@output_dir+"/")
-            end
-
-            # Now Process Main
-            processed = @process_block.call(dir+"/"+file, *args)
-          else
-            puts "Processed file exists, skipping"
-            puts " " + dir + "/" + file
-            processed = File.read(get_write_dir(dir, file))
-          end
-
-        rescue Exception => e # really catch any failures
-          report_status("Error on file "+file+": "+e.to_s)
-          if @failure_mode == "debug"
-            binding.pry
-          elsif @failure_mode == "log"
-            error_file = dir + "/" + file + "\n"
-            IO.write(@output_dir+"/error_log.txt", error_file, mode: 'a')
-          end
+          output_results(process_file(dir, file, *args), dir, file)
+        rescue Exception => e
+          handle_failure(e, dir, file, *args)
         end
-
-        # Only save in output if specified (to handle large dirs)
-        report_results([JSON.parse(processed)], dir+"/"+file)
-
-        # Write Output to file
-        File.write(get_write_dir(dir, file), processed)
       end
     end
   end
 
-  #
-  def
-
-
-
-
+  # Process a file using the blocks given
+  def process_file(dir, file, *args)
+    create_write_dirs(dir.gsub(@path, @output_dir))
+
+    # Run blocks to process the file
+    if !File.exist?(get_write_path(dir, file))
+      @extras_block.call("#{@output_dir}/") if !@extras_block.empty?
+      return @process_block.call("#{dir}/#{file}", *args)
+    else # Use already existing file
+      puts "Processed file exists, skipping: #{dir}/#{file}"
+      return File.read(get_write_path(dir, file))
     end
   end
 
-  #
-  def
-
-
-  end
+  # Output the results to Harvester and file dir
+  def output_results(processed, dir, file)
+    @reporter.report_results([JSON.parse(processed)], "#{dir}/#{file}")
+    File.write(get_write_path(dir, file), processed)
   end
 
-  #
-  def
-
-
-
-
-
+  # Create if they don't exist
+  def create_write_dirs(dir)
+    dirs = dir.split("/")
+    dirs.delete("")
+
+    # Go through and create all subdirs
+    overallpath = ""
+    dirs.each do |d|
+      Dir.mkdir(overallpath+"/"+d) if !File.directory?(overallpath+"/"+d)
+      overallpath += ("/"+d)
     end
   end
 
-  #
-  def
-
-
-      Curl::PostField.content('selector_id', @selector_id),
-      Curl::PostField.content('status_message', "Processed " + path),
-      Curl::PostField.content('results', JSON.pretty_generate(results)))
+  # Figure out where to write the file
+  def get_write_path(dir, file)
+    dir_save = dir.gsub(@path, @output_dir)
+    return "#{dir_save}/#{file}.json"
   end
 
-  #
-  def
-
+  # Handle different failure modes
+  def handle_failure(error, dir, file, *args)
+    if @failure_mode == "debug"
+      binding.pry
+    elsif @failure_mode == "log"
+      error_file = "#{dir}/#{file}\n"
+      IO.write(@output_dir+"/error_log.txt", error_file, mode: 'a')
+    end
   end
 end
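The reworked constructor changes the public call signature: the path, output directory, ignore pattern, and failure mode now arrive in a single path_params hash, and a cm_hash is passed through to HarvesterReporter. A minimal usage sketch against the 0.0.15 signature; the paths, block bodies, and cm_hash contents here are hypothetical, and the keys HarvesterReporter expects are not shown in this diff:

require 'json'
require 'dircrawl'

path_params = {
  path: "/data/input",          # hypothetical tree to crawl
  output_dir: "/data/output",   # results mirrored here as <file>.json
  ignore_includes: ".tmp",      # skip files whose names contain this
  failure_mode: "log"           # "debug" drops into pry on errors instead
}

# Must return a JSON string: output_results runs JSON.parse on it for
# reporting, then writes the raw string into the output tree.
process_block = lambda do |filepath, *args|
  JSON.generate(file: filepath, size: File.size(filepath))
end

include_block = lambda {}  # called once, without arguments, during setup
extras_block  = ""         # an empty string disables the extras step,
                           # matching the @extras_block.empty? guard
cm_hash = {}               # forwarded verbatim to HarvesterReporter.new

DirCrawl.new(path_params, process_block, include_block, extras_block, cm_hash)

Note that crawling starts inside the constructor itself: initialize ends with crawl_dir(@path, *args), so DirCrawl.new only returns once the whole tree has been processed.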
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: dircrawl
 version: !ruby/object:Gem::Version
-  version: 0.0.14
+  version: 0.0.15
 platform: ruby
 authors:
 - M. C. McGrath
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2017-
+date: 2017-05-22 00:00:00.000000000 Z
 dependencies: []
 description: Run block on all files in dir
 email: shidash@shidash.com
@@ -38,9 +38,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
     version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 2.
+rubygems_version: 2.6.11
 signing_key:
 specification_version: 4
 summary: Run block on all files in dir
 test_files: []
-has_rdoc:
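Note that dependencies: [] is unchanged even though lib/dircrawl.rb now requires harvesterreporter (and still requires pry), so RubyGems will not install either gem automatically. A hypothetical consumer Gemfile therefore has to declare them alongside dircrawl:

# Gemfile (hypothetical consumer setup)
source 'https://rubygems.org'

gem 'dircrawl', '0.0.15'
# Runtime requires of lib/dircrawl.rb not declared in the gemspec:
gem 'pry'
gem 'harvesterreporter'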