dircrawl 0.0.14 → 0.0.15
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/dircrawl.rb +55 -100
- metadata +3 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 042b2465ba3729dd8b32b61a6fa55a9732ce6bb0
|
4
|
+
data.tar.gz: 3c8bb1ef5cf0fffed384f91fdda387f41185ed97
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 29b0dd91babcdf8600b2ac3c7600749986018e418706f539574c274dca51df7ca8e649dcd861dda1869ac2cde206d2fd1b45a4643d260055ad698bf8711efc9e
|
7
|
+
data.tar.gz: 4bda8db830eedc8f15e11f104874881034ddedc67434d5e86736cdd43ef5ea013cd9a21ff2566c7a857e8409265f3df25366e7f865479999a1090d79dbaef7fa
|
data/lib/dircrawl.rb
CHANGED
@@ -1,47 +1,27 @@
|
|
1
1
|
require 'json'
|
2
2
|
require 'pry'
|
3
|
-
require '
|
4
|
-
require 'selenium-webdriver'
|
5
|
-
require 'uri'
|
3
|
+
require 'harvesterreporter'
|
6
4
|
|
5
|
+
# Crawls a directory of files and runs a block of code on it
|
7
6
|
class DirCrawl
|
8
|
-
def initialize(
|
9
|
-
|
10
|
-
@
|
11
|
-
@
|
7
|
+
def initialize(path_params, process_block, include_block, extras_block, cm_hash, *args)
|
8
|
+
# Set the params for the path
|
9
|
+
@path = path_params[:path]
|
10
|
+
@output_dir = path_params[:output_dir]
|
11
|
+
@ignore_includes = path_params[:ignore_includes]
|
12
|
+
@failure_mode = path_params[:failure_mode]
|
13
|
+
|
14
|
+
# Setup the blocks to run
|
12
15
|
include_block.call
|
13
16
|
@process_block = process_block
|
14
17
|
@extras_block = extras_block
|
15
|
-
@failure_mode = failure_mode
|
16
|
-
@output = Array.new
|
17
|
-
@save = save
|
18
18
|
|
19
|
-
#
|
20
|
-
@
|
21
|
-
@
|
22
|
-
|
23
|
-
# Crawl
|
24
|
-
crawl_dir(path, *args)
|
19
|
+
# Setup the Harvester reporter to report the results
|
20
|
+
@reporter = HarvesterReporter.new(cm_hash)
|
21
|
+
crawl_dir(@path, *args)
|
25
22
|
end
|
26
23
|
|
27
|
-
#
|
28
|
-
def get_write_dir(dir, file)
|
29
|
-
dir_save = dir.gsub(@path, @output_dir)
|
30
|
-
return dir_save+"/"+file+".json"
|
31
|
-
end
|
32
|
-
|
33
|
-
# Create if they don't exist
|
34
|
-
def create_write_dirs(dir)
|
35
|
-
dirs = dir.split("/")
|
36
|
-
dirs.delete("")
|
37
|
-
overallpath = ""
|
38
|
-
dirs.each do |d|
|
39
|
-
Dir.mkdir(overallpath+"/"+d) if !File.directory?(overallpath+"/"+d)
|
40
|
-
overallpath += ("/"+d)
|
41
|
-
end
|
42
|
-
end
|
43
|
-
|
44
|
-
# Crawl dir and call block for each file
|
24
|
+
# Crawls the directory specified
|
45
25
|
def crawl_dir(dir, *args)
|
46
26
|
Dir.foreach(dir) do |file|
|
47
27
|
# Skip . or .. files
|
@@ -49,90 +29,65 @@ class DirCrawl
|
|
49
29
|
|
50
30
|
# Recurse into directories
|
51
31
|
if File.directory?(dir+"/"+file)
|
52
|
-
|
53
|
-
crawl_dir(dir+"/"+file, *args)
|
32
|
+
crawl_dir("#{dir}/#{file}", *args)
|
54
33
|
|
55
34
|
# Process file
|
56
35
|
elsif !file.include?(@ignore_includes)
|
57
|
-
|
58
|
-
# Create Output Directory
|
59
|
-
create_write_dirs(dir.gsub(@path, @output_dir))
|
60
|
-
|
61
36
|
begin
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
if !File.exist?(get_write_dir(dir, file))
|
66
|
-
|
67
|
-
# Process Extras
|
68
|
-
if @extras_block != ""
|
69
|
-
extras = @extras_block.call(@output_dir+"/")
|
70
|
-
end
|
71
|
-
|
72
|
-
# Now Process Main
|
73
|
-
processed = @process_block.call(dir+"/"+file, *args)
|
74
|
-
else
|
75
|
-
puts "Processed file exists, skipping"
|
76
|
-
puts " " + dir + "/" + file
|
77
|
-
processed = File.read(get_write_dir(dir, file))
|
78
|
-
end
|
79
|
-
|
80
|
-
rescue Exception => e # really catch any failures
|
81
|
-
report_status("Error on file "+file+": "+e.to_s)
|
82
|
-
if @failure_mode == "debug"
|
83
|
-
binding.pry
|
84
|
-
elsif @failure_mode == "log"
|
85
|
-
error_file = dir + "/" + file + "\n"
|
86
|
-
IO.write(@output_dir+"/error_log.txt", error_file, mode: 'a')
|
87
|
-
end
|
37
|
+
output_results(process_file(dir, file, *args), dir, file)
|
38
|
+
rescue Exception => e
|
39
|
+
handle_failure(e, dir, file, *args)
|
88
40
|
end
|
89
|
-
|
90
|
-
# Only save in output if specified (to handle large dirs)
|
91
|
-
report_results([JSON.parse(processed)], dir+"/"+file)
|
92
|
-
|
93
|
-
# Write Output to file
|
94
|
-
File.write(get_write_dir(dir, file), processed)
|
95
41
|
end
|
96
42
|
end
|
97
43
|
end
|
98
44
|
|
99
|
-
#
|
100
|
-
def
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
45
|
+
# Process a file using the blocks given
|
46
|
+
def process_file(dir, file, *args)
|
47
|
+
create_write_dirs(dir.gsub(@path, @output_dir))
|
48
|
+
|
49
|
+
# Run blocks to process the file
|
50
|
+
if !File.exist?(get_write_path(dir, file))
|
51
|
+
@extras_block.call("#{@output_dir}/") if !@extras_block.empty?
|
52
|
+
return @process_block.call("#{dir}/#{file}", *args)
|
53
|
+
else # Use already existing file
|
54
|
+
puts "Processed file exists, skipping: #{dir}/#{file}"
|
55
|
+
return File.read(get_write_path(dir, file))
|
105
56
|
end
|
106
57
|
end
|
107
58
|
|
108
|
-
#
|
109
|
-
def
|
110
|
-
|
111
|
-
|
112
|
-
end
|
59
|
+
# Output the results to Harvester and file dir
|
60
|
+
def output_results(processed, dir, file)
|
61
|
+
@reporter.report_results([JSON.parse(processed)], "#{dir}/#{file}")
|
62
|
+
File.write(get_write_path(dir, file), processed)
|
113
63
|
end
|
114
64
|
|
115
|
-
#
|
116
|
-
def
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
65
|
+
# Create the write directories if they don't exist
|
66
|
+
def create_write_dirs(dir)
|
67
|
+
dirs = dir.split("/")
|
68
|
+
dirs.delete("")
|
69
|
+
|
70
|
+
# Go through and create all subdirs
|
71
|
+
overallpath = ""
|
72
|
+
dirs.each do |d|
|
73
|
+
Dir.mkdir(overallpath+"/"+d) if !File.directory?(overallpath+"/"+d)
|
74
|
+
overallpath += ("/"+d)
|
122
75
|
end
|
123
76
|
end
|
124
77
|
|
125
|
-
#
|
126
|
-
def
|
127
|
-
|
128
|
-
|
129
|
-
Curl::PostField.content('selector_id', @selector_id),
|
130
|
-
Curl::PostField.content('status_message', "Processed " + path),
|
131
|
-
Curl::PostField.content('results', JSON.pretty_generate(results)))
|
78
|
+
# Figure out where to write the file
|
79
|
+
def get_write_path(dir, file)
|
80
|
+
dir_save = dir.gsub(@path, @output_dir)
|
81
|
+
return "#{dir_save}/#{file}.json"
|
132
82
|
end
|
133
83
|
|
134
|
-
#
|
135
|
-
def
|
136
|
-
|
84
|
+
# Handle different failure modes
|
85
|
+
def handle_failure(error, dir, file, *args)
|
86
|
+
if @failure_mode == "debug"
|
87
|
+
binding.pry
|
88
|
+
elsif @failure_mode == "log"
|
89
|
+
error_file = "#{dir}/#{file}\n"
|
90
|
+
IO.write(@output_dir+"/error_log.txt", error_file, mode: 'a')
|
91
|
+
end
|
137
92
|
end
|
138
93
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: dircrawl
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.15
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- M. C. McGrath
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2017-
|
12
|
+
date: 2017-05-22 00:00:00.000000000 Z
|
13
13
|
dependencies: []
|
14
14
|
description: Run block on all files in dir
|
15
15
|
email: shidash@shidash.com
|
@@ -38,9 +38,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
38
38
|
version: '0'
|
39
39
|
requirements: []
|
40
40
|
rubyforge_project:
|
41
|
-
rubygems_version: 2.
|
41
|
+
rubygems_version: 2.6.11
|
42
42
|
signing_key:
|
43
43
|
specification_version: 4
|
44
44
|
summary: Run block on all files in dir
|
45
45
|
test_files: []
|
46
|
-
has_rdoc:
|