datahen 0.10.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +12 -0
- data/.travis.yml +7 -0
- data/CODE_OF_CONDUCT.md +74 -0
- data/Gemfile +6 -0
- data/LICENSE.txt +21 -0
- data/README.md +29 -0
- data/Rakefile +22 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/datahen.gemspec +47 -0
- data/examples/fetchtest/libraries/hello.rb +9 -0
- data/examples/fetchtest/libraries/hello_fail.rb +10 -0
- data/examples/fetchtest/parsers/failed.rb +2 -0
- data/examples/fetchtest/parsers/find_outputs.rb +18 -0
- data/examples/fetchtest/parsers/home.rb +50 -0
- data/examples/fetchtest/parsers/nested_fail.rb +3 -0
- data/examples/fetchtest/parsers/simple.rb +14 -0
- data/examples/fetchtest/seeders/csv_seeder.rb +12 -0
- data/examples/fetchtest/seeders/failed.rb +1 -0
- data/examples/fetchtest/seeders/list_of_urls.csv +5 -0
- data/examples/fetchtest/seeders/seed.rb +28 -0
- data/examples/fetchtest/seeders/test_reset_page.rb +4 -0
- data/exe/hen +3 -0
- data/lib/datahen.rb +5 -0
- data/lib/datahen/cli.rb +45 -0
- data/lib/datahen/cli/env_var.rb +48 -0
- data/lib/datahen/cli/finisher.rb +40 -0
- data/lib/datahen/cli/global_page.rb +39 -0
- data/lib/datahen/cli/job.rb +30 -0
- data/lib/datahen/cli/job_output.rb +69 -0
- data/lib/datahen/cli/parser.rb +64 -0
- data/lib/datahen/cli/scraper.rb +185 -0
- data/lib/datahen/cli/scraper_deployment.rb +24 -0
- data/lib/datahen/cli/scraper_export.rb +51 -0
- data/lib/datahen/cli/scraper_exporter.rb +40 -0
- data/lib/datahen/cli/scraper_finisher.rb +20 -0
- data/lib/datahen/cli/scraper_job.rb +75 -0
- data/lib/datahen/cli/scraper_job_var.rb +48 -0
- data/lib/datahen/cli/scraper_page.rb +203 -0
- data/lib/datahen/cli/scraper_var.rb +48 -0
- data/lib/datahen/cli/seeder.rb +40 -0
- data/lib/datahen/client.rb +29 -0
- data/lib/datahen/client/auth_token.rb +50 -0
- data/lib/datahen/client/backblaze_content.rb +45 -0
- data/lib/datahen/client/base.rb +69 -0
- data/lib/datahen/client/deploy_key.rb +21 -0
- data/lib/datahen/client/env_var.rb +28 -0
- data/lib/datahen/client/export.rb +10 -0
- data/lib/datahen/client/global_page.rb +18 -0
- data/lib/datahen/client/job.rb +64 -0
- data/lib/datahen/client/job_export.rb +10 -0
- data/lib/datahen/client/job_log.rb +26 -0
- data/lib/datahen/client/job_output.rb +19 -0
- data/lib/datahen/client/job_page.rb +58 -0
- data/lib/datahen/client/job_stat.rb +16 -0
- data/lib/datahen/client/scraper.rb +57 -0
- data/lib/datahen/client/scraper_deployment.rb +18 -0
- data/lib/datahen/client/scraper_export.rb +22 -0
- data/lib/datahen/client/scraper_exporter.rb +14 -0
- data/lib/datahen/client/scraper_finisher.rb +16 -0
- data/lib/datahen/client/scraper_job.rb +49 -0
- data/lib/datahen/client/scraper_job_output.rb +19 -0
- data/lib/datahen/client/scraper_job_page.rb +67 -0
- data/lib/datahen/client/scraper_job_var.rb +28 -0
- data/lib/datahen/client/scraper_var.rb +28 -0
- data/lib/datahen/plugin.rb +6 -0
- data/lib/datahen/plugin/context_exposer.rb +55 -0
- data/lib/datahen/scraper.rb +18 -0
- data/lib/datahen/scraper/executor.rb +373 -0
- data/lib/datahen/scraper/finisher.rb +18 -0
- data/lib/datahen/scraper/parser.rb +18 -0
- data/lib/datahen/scraper/ruby_finisher_executor.rb +116 -0
- data/lib/datahen/scraper/ruby_parser_executor.rb +200 -0
- data/lib/datahen/scraper/ruby_seeder_executor.rb +120 -0
- data/lib/datahen/scraper/seeder.rb +18 -0
- data/lib/datahen/version.rb +3 -0
- metadata +270 -0
@@ -0,0 +1,18 @@
|
|
1
|
+
module Datahen
  module Scraper
    # Dispatches a finisher script to the executor that matches its file
    # extension. Only Ruby scripts (".rb") have an executor here.
    class Finisher
      # Runs the finisher script at +filename+.
      #
      # @param filename [String] path of the finisher script; its extension
      #   selects the executor.
      # @param job_id [Object, nil] forwarded to the executor as +:job_id+.
      # @param save [Boolean] forwarded to the executor's +exec_finisher+.
      # @return [Object, nil] whatever the executor returns, or nil when the
      #   extension is not supported (a message is printed instead).
      def self.exec_finisher(filename, job_id=nil, save=false)
        extension = File.extname(filename)

        # Anything other than a Ruby script has no executor: report and bail.
        unless extension == '.rb'
          puts "Unable to find a finisher executor for file type \"#{extension}\""
          return
        end

        RubyFinisherExecutor.new(filename: filename, job_id: job_id).exec_finisher(save)
      end
    end
  end
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
module Datahen
  module Scraper
    # Dispatches a parser script to the executor that matches its file
    # extension. Only Ruby scripts (".rb") have an executor here.
    class Parser
      # Runs the parser script at +filename+ against the page identified by
      # +gid+.
      #
      # @param filename [String] path of the parser script; its extension
      #   selects the executor.
      # @param gid [Object] forwarded to the executor as +:gid+.
      # @param job_id [Object, nil] forwarded to the executor as +:job_id+.
      # @param save [Boolean] forwarded to the executor's +exec_parser+.
      # @param vars [Hash] forwarded to the executor as +:vars+.
      # @return [Object, nil] whatever the executor returns, or nil when the
      #   extension is not supported (a message is printed instead).
      def self.exec_parser_page(filename, gid, job_id=nil, save=false, vars = {})
        extension = File.extname(filename)

        # Anything other than a Ruby script has no executor: report and bail.
        unless extension == '.rb'
          puts "Unable to find a parser executor for file type \"#{extension}\""
          return
        end

        runner = RubyParserExecutor.new(filename: filename, gid: gid, job_id: job_id, vars: vars)
        runner.exec_parser(save)
      end
    end
  end
end
|
@@ -0,0 +1,116 @@
|
|
1
|
+
module Datahen
  module Scraper
    # Evaluates a Ruby finisher script for a job.
    #
    # When run with +save+ truthy, the methods defined here report the
    # finisher status to the server (+:starting+, then +:done+ or +:failed+).
    # When +save+ is falsy, none of the status/error calls in this class are
    # made. Helpers such as +isolated_binding+, +eval_with_context+,
    # +save_outputs+, +finisher_update+ and +clean_backtrace+ are not defined
    # here; presumably they come from Executor (confirm there).
    class RubyFinisherExecutor < Executor
      # @return [Boolean] the save flag recorded by #exec_finisher; guards the
      #   starting/done status updates.
      attr_accessor :save

      # @param options [Hash] +:filename+ is required, +:job_id+ is optional.
      # @raise [RuntimeError] when +:filename+ is missing.
      def initialize(options={})
        @filename = options.fetch(:filename) { raise "Filename is required"}
        @job_id = options[:job_id]
      end

      # @return [Array<Symbol>] frozen list of method names; presumably the
      #   methods made available to the finisher script (confirm in Executor).
      def self.exposed_methods
        %i[
          outputs
          save_outputs
          find_output
          find_outputs
        ].freeze
      end

      # Records the save flag, announces the mode and evaluates the script.
      #
      # @param save [Boolean] true to execute and report, false to only try.
      def exec_finisher(save=false)
        @save = save
        puts(save ? "Executing finisher script" : "Trying finisher script")

        eval_finisher_script(save)
      end

      # Evaluates the finisher script, saves the collected outputs and marks
      # the finisher as done. Any error is reported through #handle_error (only
      # when +save+ is truthy) and then re-raised to the caller.
      #
      # @param save [Boolean] whether failures are reported to the server.
      def eval_finisher_script(save=false)
        update_finisher_starting_status

        outputs = []

        begin
          context = isolated_binding({
            outputs: outputs,
            job_id: job_id
          })
          eval_with_context filename, context
        rescue SyntaxError, StandardError => e
          # SyntaxError is listed explicitly because it is not a StandardError
          # and a bare rescue would let it through unreported.
          handle_error(e) if save
          raise e
        end

        puts "=========== Finisher Executed ==========="
        begin
          save_outputs(outputs)
        rescue => e
          handle_error(e) if save
          raise e
        end

        update_finisher_done_status
      end

      # @return [Symbol] always +:executing+.
      def save_type
        :executing
      end

      # Forwards the job id, outputs and status found in +opts+ to
      # +finisher_update+.
      #
      # @param opts [Hash] reads +:job_id+, +:outputs+ and +:status+.
      def update_to_server(opts = {})
        finisher_update(
          job_id: opts[:job_id],
          outputs: opts[:outputs],
          finisher_status: opts[:status])
      end

      # Reports the +:starting+ finisher status; does nothing unless +save+.
      #
      # @raise [RuntimeError] when the response code is not 200.
      def update_finisher_starting_status
        return unless save

        response = finisher_update(
          job_id: job_id,
          finisher_status: :starting)

        unless response.code == 200
          puts "Error: Unable to save Finisher Status to server: #{response.body}"
          raise "Unable to save Finisher Status to server: #{response.body}"
        end

        puts "Finisher Status Updated."
      end

      # Reports the +:done+ finisher status; does nothing unless +save+.
      #
      # @raise [RuntimeError] when the response code is not 200.
      def update_finisher_done_status
        return unless save

        response = finisher_update(
          job_id: job_id,
          finisher_status: :done)

        unless response.code == 200
          puts "Error: Unable to save Finisher Done Status to server: #{response.body}"
          raise "Unable to save Finisher Done Status to server: #{response.body}"
        end

        puts "Finisher Done."
      end

      # Reports the +:failed+ finisher status together with an error log made
      # of a summary line followed by the cleaned backtrace.
      #
      # @param e [Exception] the error raised while running the finisher.
      def handle_error(e)
        # The job id is wrapped in a balanced "(Job:...)" so the summary line
        # matches the "(Job:... GID:...)" form used by the parser executor.
        summary = "Finisher #{e.class}: #{e} (Job:#{job_id})"
        error = [summary, clean_backtrace(e.backtrace)].join("\n")

        finisher_update(
          job_id: job_id,
          finisher_status: :failed,
          log_error: error)
      end

    end
  end
end
|
@@ -0,0 +1,200 @@
|
|
1
|
+
module Datahen
  module Scraper
    # Evaluates a Ruby parser script against one page (identified by +gid+)
    # of a job.
    #
    # When run with +save+ truthy, the methods defined here report the parsing
    # status to the server and issue refetch/reparse requests; otherwise they
    # only print what would have been done. Helpers such as +init_page+,
    # +isolated_binding+, +eval_with_context+, +save_pages_and_outputs+,
    # +parsing_update+, +get_content+ and +clean_backtrace+ are not defined
    # here; presumably they come from Executor (confirm there).
    class RubyParserExecutor < Executor
      # @return [Boolean] the save flag recorded by #exec_parser.
      attr_accessor :save
      # Refetch self page flag.
      # @return [Boolean]
      # @note It is stronger than #reparse_self flag.
      attr_accessor :refetch_self
      # Reparse self page flag.
      # @return [Boolean]
      attr_accessor :reparse_self

      # @param options [Hash] +:filename+, +:gid+ and +:job_id+ are required
      #   keys; +:vars+ defaults to an empty hash.
      # @raise [RuntimeError] when +:filename+ or +:gid+ is missing.
      # @raise [KeyError] when +:job_id+ is missing.
      def initialize(options={})
        @filename = options.fetch(:filename) { raise "Filename is required"}
        @gid = options.fetch(:gid) { raise "GID is required"}
        @job_id = options.fetch(:job_id)
        @page_vars = options.fetch(:vars) { {} }
      end

      # @return [Array<Symbol>] frozen list of method names; presumably the
      #   methods made available to the parser script (confirm in Executor).
      def self.exposed_methods
        %i[
          content
          failed_content
          outputs
          pages
          page
          save_pages
          save_outputs
          find_output
          find_outputs
          refetch
          reparse
        ].freeze
      end

      # Records the save flag, announces the mode and evaluates the script.
      #
      # @param save [Boolean] true to execute and report, false to only try.
      def exec_parser(save=false)
        @save = save
        puts(save ? "Executing parser script" : "Trying parser script")

        eval_parser_script(save)
      end

      # Overrides the page's 'vars' entry with the vars given at construction,
      # but only when some were actually provided.
      #
      # @param page [Hash] page record; mutated in place.
      # @return [Hash] the same page object.
      def init_page_vars(page)
        page['vars'] = @page_vars unless @page_vars.nil? || @page_vars.empty?
        page
      end

      # Forwards the values found in +opts+ to +parsing_update+.
      #
      # @param opts [Hash] reads +:job_id+, +:gid+, +:pages+, +:outputs+ and
      #   +:status+.
      def update_to_server(opts = {})
        parsing_update(
          job_id: opts[:job_id],
          gid: opts[:gid],
          pages: opts[:pages],
          outputs: opts[:outputs],
          parsing_status: opts[:status])
      end

      # Reports the +:starting+ parsing status; does nothing unless +save+.
      #
      # @raise [RuntimeError] when the response code is not 200.
      def update_parsing_starting_status
        return unless save

        response = parsing_update(
          job_id: job_id,
          gid: gid,
          parsing_status: :starting)

        unless response.code == 200
          puts "Error: Unable to save Page Parsing Status to server: #{response.body}"
          raise "Unable to save Page Parsing Status to server: #{response.body}"
        end

        puts "Page Parsing Status Updated."
      end

      # Reports the +:done+ parsing status; does nothing unless +save+.
      #
      # @raise [RuntimeError] when the response code is not 200.
      def update_parsing_done_status
        return unless save

        response = parsing_update(
          job_id: job_id,
          gid: gid,
          parsing_status: :done)

        unless response.code == 200
          puts "Error: Unable to save Page Parsing Done Status to server: #{response.body}"
          raise "Unable to save Page Parsing Done Status to server: #{response.body}"
        end

        puts "Page Parsing Done."
      end

      # @return [Symbol] always +:parsing+.
      def save_type
        :parsing
      end

      # Requests a refetch of the page +gid+ for this job when saving;
      # otherwise only prints what would have happened.
      #
      # @param gid [String] page to refetch (shadows the executor's own gid).
      def refetch_page gid
        unless save
          puts "Would have refetch page #{gid}"
          return
        end

        Client::ScraperJobPage.new({gid: gid}).refetch_by_job(self.job_id)
        puts "Refetch page #{gid}"
      end

      # Script-facing refetch. Asking for the page currently being parsed only
      # raises the #refetch_self flag, which #eval_parser_script acts on after
      # the script has run; any other page is refetched right away.
      #
      # @param page_gid [String]
      # @raise [ArgumentError] when +page_gid+ is not a String.
      def refetch page_gid
        raise ArgumentError, "page_gid needs to be a String." unless page_gid.is_a?(String)
        return refetch_page(page_gid) unless page_gid == gid

        self.refetch_self = true
        nil
      end

      # Requests a reparse of the page +gid+ for this job when saving;
      # otherwise only prints what would have happened.
      #
      # @param gid [String] page to reparse (shadows the executor's own gid).
      def reparse_page gid
        unless save
          puts "Would have reparse page #{gid}"
          return
        end

        Client::ScraperJobPage.new({gid: gid}).reparse_by_job(self.job_id)
        puts "Reparse page #{gid}"
      end

      # Script-facing reparse. Asking for the page currently being parsed only
      # raises the #reparse_self flag, which #eval_parser_script acts on after
      # the script has run; any other page is reparsed right away.
      #
      # @param page_gid [String]
      # @raise [ArgumentError] when +page_gid+ is not a String.
      def reparse page_gid
        raise ArgumentError, "page_gid needs to be a String." unless page_gid.is_a?(String)
        return reparse_page(page_gid) unless page_gid == gid

        self.reparse_self = true
        nil
      end

      # Evaluates the parser script, saves the collected pages and outputs,
      # then either refetches/reparses the current page (if the script asked
      # for it) or marks parsing as done. Any error is reported through
      # #handle_error (only when +save+ is truthy) and then re-raised.
      #
      # @param save [Boolean] whether failures are reported to the server.
      def eval_parser_script(save=false)
        update_parsing_starting_status

        page = init_page
        outputs = []
        pages = []
        page = init_page_vars(page)
        # Start every run with both self-flags cleared; the script may set
        # them through #refetch / #reparse.
        self.refetch_self = false
        self.reparse_self = false

        begin
          context = isolated_binding({
            outputs: outputs,
            pages: pages,
            page: page
          })
          eval_with_context filename, context
        rescue SyntaxError, StandardError => e
          # SyntaxError is listed explicitly because it is not a StandardError.
          handle_error(e) if save
          raise e
        end

        puts "=========== Parsing Executed ==========="
        begin
          save_pages_and_outputs(pages, outputs, :parsing)
        rescue => e
          handle_error(e) if save
          raise e
        end

        # Refetch wins over reparse; the done status is only sent when neither
        # was requested for the current page.
        return refetch_page(gid) if refetch_self
        return reparse_page(gid) if reparse_self

        update_parsing_done_status
      end

      # @return [Object] result of +get_content(gid)+, memoized.
      def content
        @content ||= get_content(gid)
      end

      # @return [Object] result of +get_failed_content(gid)+, memoized.
      def failed_content
        @failed_content ||= get_failed_content(gid)
      end

      # Reports the +:failed+ parsing status together with an error log made
      # of a summary line followed by the cleaned backtrace.
      #
      # @param e [Exception] the error raised while parsing.
      def handle_error(e)
        summary = "Parsing #{e.class}: #{e.to_s} (Job:#{job_id} GID:#{gid})"
        error = [summary, clean_backtrace(e.backtrace)].join("\n")

        parsing_update(
          job_id: job_id,
          gid: gid,
          parsing_status: :failed,
          log_error: error)
      end

    end
  end
end
|
@@ -0,0 +1,120 @@
|
|
1
|
+
module Datahen
  module Scraper
    # Evaluates a Ruby seeder script for a job.
    #
    # When run with +save+ truthy, the methods defined here report the seeding
    # status to the server (+:starting+, then +:done+ or +:failed+). When
    # +save+ is falsy, none of the status/error calls in this class are made.
    # Helpers such as +isolated_binding+, +eval_with_context+,
    # +save_pages_and_outputs+, +seeding_update+ and +clean_backtrace+ are not
    # defined here; presumably they come from Executor (confirm there).
    class RubySeederExecutor < Executor
      # @return [Boolean] the save flag recorded by #exec_seeder; guards the
      #   starting/done status updates.
      attr_accessor :save

      # @param options [Hash] +:filename+ is required, +:job_id+ is optional.
      # @raise [RuntimeError] when +:filename+ is missing.
      def initialize(options={})
        @filename = options.fetch(:filename) { raise "Filename is required"}
        @job_id = options[:job_id]
      end

      # @return [Array<Symbol>] frozen list of method names; presumably the
      #   methods made available to the seeder script (confirm in Executor).
      def self.exposed_methods
        %i[
          outputs
          pages
          save_pages
          save_outputs
          find_output
          find_outputs
        ].freeze
      end

      # Records the save flag, announces the mode and evaluates the script.
      #
      # @param save [Boolean] true to execute and report, false to only try.
      def exec_seeder(save=false)
        @save = save
        puts(save ? "Executing seeder script" : "Trying seeder script")

        eval_seeder_script(save)
      end

      # Evaluates the seeder script, saves the collected pages and outputs and
      # marks seeding as done. Any error is reported through #handle_error
      # (only when +save+ is truthy) and then re-raised to the caller.
      #
      # @param save [Boolean] whether failures are reported to the server.
      def eval_seeder_script(save=false)
        update_seeding_starting_status

        outputs = []
        pages = []

        begin
          context = isolated_binding({
            outputs: outputs,
            pages: pages
          })
          eval_with_context filename, context
        rescue SyntaxError, StandardError => e
          # SyntaxError is listed explicitly because it is not a StandardError
          # and a bare rescue would let it through unreported.
          handle_error(e) if save
          raise e
        end

        puts "=========== Seeding Executed ==========="
        begin
          save_pages_and_outputs(pages, outputs, :seeding)
        rescue => e
          handle_error(e) if save
          raise e
        end

        update_seeding_done_status
      end

      # @return [Symbol] always +:seeding+.
      def save_type
        :seeding
      end

      # Forwards the job id, pages, outputs and status found in +opts+ to
      # +seeding_update+.
      #
      # @param opts [Hash] reads +:job_id+, +:pages+, +:outputs+ and +:status+.
      def update_to_server(opts = {})
        seeding_update(
          job_id: opts[:job_id],
          pages: opts[:pages],
          outputs: opts[:outputs],
          seeding_status: opts[:status])
      end

      # Reports the +:starting+ seeding status; does nothing unless +save+.
      #
      # @raise [RuntimeError] when the response code is not 200.
      def update_seeding_starting_status
        return unless save

        response = seeding_update(
          job_id: job_id,
          seeding_status: :starting)

        unless response.code == 200
          puts "Error: Unable to save Seeding Status to server: #{response.body}"
          raise "Unable to save Seeding Status to server: #{response.body}"
        end

        puts "Seeding Status Updated."
      end

      # Reports the +:done+ seeding status; does nothing unless +save+.
      #
      # @raise [RuntimeError] when the response code is not 200.
      def update_seeding_done_status
        return unless save

        response = seeding_update(
          job_id: job_id,
          seeding_status: :done)

        unless response.code == 200
          puts "Error: Unable to save Seeding Done Status to server: #{response.body}"
          raise "Unable to save Seeding Done Status to server: #{response.body}"
        end

        puts "Seeding Done."
      end

      # Reports the +:failed+ seeding status together with an error log made
      # of a summary line followed by the cleaned backtrace.
      #
      # @param e [Exception] the error raised while seeding.
      def handle_error(e)
        # The job id is wrapped in a balanced "(Job:...)" so the summary line
        # matches the "(Job:... GID:...)" form used by the parser executor.
        summary = "Seeding #{e.class}: #{e} (Job:#{job_id})"
        error = [summary, clean_backtrace(e.backtrace)].join("\n")

        seeding_update(
          job_id: job_id,
          seeding_status: :failed,
          log_error: error)
      end

    end
  end
end
|