datahen 0.10.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +12 -0
- data/.travis.yml +7 -0
- data/CODE_OF_CONDUCT.md +74 -0
- data/Gemfile +6 -0
- data/LICENSE.txt +21 -0
- data/README.md +29 -0
- data/Rakefile +22 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/datahen.gemspec +47 -0
- data/examples/fetchtest/libraries/hello.rb +9 -0
- data/examples/fetchtest/libraries/hello_fail.rb +10 -0
- data/examples/fetchtest/parsers/failed.rb +2 -0
- data/examples/fetchtest/parsers/find_outputs.rb +18 -0
- data/examples/fetchtest/parsers/home.rb +50 -0
- data/examples/fetchtest/parsers/nested_fail.rb +3 -0
- data/examples/fetchtest/parsers/simple.rb +14 -0
- data/examples/fetchtest/seeders/csv_seeder.rb +12 -0
- data/examples/fetchtest/seeders/failed.rb +1 -0
- data/examples/fetchtest/seeders/list_of_urls.csv +5 -0
- data/examples/fetchtest/seeders/seed.rb +28 -0
- data/examples/fetchtest/seeders/test_reset_page.rb +4 -0
- data/exe/hen +3 -0
- data/lib/datahen.rb +5 -0
- data/lib/datahen/cli.rb +45 -0
- data/lib/datahen/cli/env_var.rb +48 -0
- data/lib/datahen/cli/finisher.rb +40 -0
- data/lib/datahen/cli/global_page.rb +39 -0
- data/lib/datahen/cli/job.rb +30 -0
- data/lib/datahen/cli/job_output.rb +69 -0
- data/lib/datahen/cli/parser.rb +64 -0
- data/lib/datahen/cli/scraper.rb +185 -0
- data/lib/datahen/cli/scraper_deployment.rb +24 -0
- data/lib/datahen/cli/scraper_export.rb +51 -0
- data/lib/datahen/cli/scraper_exporter.rb +40 -0
- data/lib/datahen/cli/scraper_finisher.rb +20 -0
- data/lib/datahen/cli/scraper_job.rb +75 -0
- data/lib/datahen/cli/scraper_job_var.rb +48 -0
- data/lib/datahen/cli/scraper_page.rb +203 -0
- data/lib/datahen/cli/scraper_var.rb +48 -0
- data/lib/datahen/cli/seeder.rb +40 -0
- data/lib/datahen/client.rb +29 -0
- data/lib/datahen/client/auth_token.rb +50 -0
- data/lib/datahen/client/backblaze_content.rb +45 -0
- data/lib/datahen/client/base.rb +69 -0
- data/lib/datahen/client/deploy_key.rb +21 -0
- data/lib/datahen/client/env_var.rb +28 -0
- data/lib/datahen/client/export.rb +10 -0
- data/lib/datahen/client/global_page.rb +18 -0
- data/lib/datahen/client/job.rb +64 -0
- data/lib/datahen/client/job_export.rb +10 -0
- data/lib/datahen/client/job_log.rb +26 -0
- data/lib/datahen/client/job_output.rb +19 -0
- data/lib/datahen/client/job_page.rb +58 -0
- data/lib/datahen/client/job_stat.rb +16 -0
- data/lib/datahen/client/scraper.rb +57 -0
- data/lib/datahen/client/scraper_deployment.rb +18 -0
- data/lib/datahen/client/scraper_export.rb +22 -0
- data/lib/datahen/client/scraper_exporter.rb +14 -0
- data/lib/datahen/client/scraper_finisher.rb +16 -0
- data/lib/datahen/client/scraper_job.rb +49 -0
- data/lib/datahen/client/scraper_job_output.rb +19 -0
- data/lib/datahen/client/scraper_job_page.rb +67 -0
- data/lib/datahen/client/scraper_job_var.rb +28 -0
- data/lib/datahen/client/scraper_var.rb +28 -0
- data/lib/datahen/plugin.rb +6 -0
- data/lib/datahen/plugin/context_exposer.rb +55 -0
- data/lib/datahen/scraper.rb +18 -0
- data/lib/datahen/scraper/executor.rb +373 -0
- data/lib/datahen/scraper/finisher.rb +18 -0
- data/lib/datahen/scraper/parser.rb +18 -0
- data/lib/datahen/scraper/ruby_finisher_executor.rb +116 -0
- data/lib/datahen/scraper/ruby_parser_executor.rb +200 -0
- data/lib/datahen/scraper/ruby_seeder_executor.rb +120 -0
- data/lib/datahen/scraper/seeder.rb +18 -0
- data/lib/datahen/version.rb +3 -0
- metadata +270 -0
@@ -0,0 +1,18 @@
|
|
1
|
+
module Datahen
  module Scraper
    # Entry point that picks a finisher executor from the script's file
    # extension and runs it.
    class Finisher
      # Runs the finisher script +filename+ with the executor registered for
      # its extension. Only '.rb' is handled here; any other extension prints
      # a message to stdout and nothing is executed.
      #
      # @param filename [String] path of the finisher script.
      # @param job_id [Object, nil] forwarded to the executor as +:job_id+.
      # @param save [Boolean] forwarded to the executor's +exec_finisher+.
      # @return [Object, nil] whatever the executor's +exec_finisher+ returns,
      #   or +nil+ when the extension is not supported.
      def self.exec_finisher(filename, job_id=nil, save=false)
        extname = File.extname(filename)
        case extname
        when '.rb'
          executor = RubyFinisherExecutor.new(filename: filename, job_id: job_id)
          executor.exec_finisher(save)
        else
          # Unsupported types are reported, not raised.
          puts "Unable to find a finisher executor for file type \"#{extname}\""
        end
      end
    end
  end
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
module Datahen
  module Scraper
    # Entry point that picks a parser executor from the script's file
    # extension and runs it against one page.
    class Parser
      # Runs the parser script +filename+ for the page identified by +gid+.
      # Only '.rb' is handled here; any other extension prints a message to
      # stdout and nothing is executed.
      #
      # @param filename [String] path of the parser script.
      # @param gid [Object] page identifier, forwarded to the executor as +:gid+.
      # @param job_id [Object, nil] forwarded to the executor as +:job_id+.
      # @param save [Boolean] forwarded to the executor's +exec_parser+.
      # @param vars [Hash] forwarded to the executor as +:vars+.
      # @return [Object, nil] whatever the executor's +exec_parser+ returns,
      #   or +nil+ when the extension is not supported.
      def self.exec_parser_page(filename, gid, job_id=nil, save=false, vars = {})
        extname = File.extname(filename)
        case extname
        when '.rb'
          executor = RubyParserExecutor.new(filename: filename, gid: gid, job_id: job_id, vars: vars)
          executor.exec_parser(save)
        else
          # Unsupported types are reported, not raised.
          puts "Unable to find a parser executor for file type \"#{extname}\""
        end
      end
    end
  end
end
|
@@ -0,0 +1,116 @@
|
|
1
|
+
module Datahen
  module Scraper
    # Executes a Ruby finisher script and, when saving is enabled, reports
    # its status and failures through +finisher_update+.
    #
    # NOTE(review): +job_id+, +filename+, +isolated_binding+,
    # +eval_with_context+, +save_outputs+, +finisher_update+ and
    # +clean_backtrace+ are not defined in this class; presumably they are
    # inherited from Executor -- confirm there.
    class RubyFinisherExecutor < Executor
      # @return [Boolean] flag set by {#exec_finisher}; the status update
      #   methods do nothing unless it is truthy.
      attr_accessor :save

      # @param options [Hash]
      # @option options [String] :filename script to run (required).
      # @option options [Object] :job_id job identifier; +nil+ when omitted.
      # @raise [RuntimeError] when +:filename+ is missing.
      def initialize(options={})
        @filename = options.fetch(:filename) { raise "Filename is required"}
        @job_id = options[:job_id]
      end

      # @return [Array<Symbol>] frozen list of method names.
      #   NOTE(review): presumably the methods made callable from inside the
      #   script -- confirm against Executor.
      def self.exposed_methods
        %i[
          outputs
          save_outputs
          find_output
          find_outputs
        ].freeze
      end

      # Records the +save+ flag, announces the run mode on stdout and
      # evaluates the script.
      #
      # @param save [Boolean] +true+ prints "Executing", +false+ "Trying".
      # @return [Object] the result of {#eval_finisher_script}.
      def exec_finisher(save=false)
        @save = save
        puts(save ? "Executing finisher script" : "Trying finisher script")

        eval_finisher_script(save)
      end

      # Evaluates the script file, then hands the collected outputs to
      # +save_outputs+ and marks the finisher as done.
      #
      # Any error raised while evaluating the script or saving outputs is
      # reported through {#handle_error} when the +save+ argument is truthy,
      # and is always re-raised.
      #
      # Note that the starting/done status updates consult the +save+
      # accessor (set by {#exec_finisher}), not this method's argument.
      #
      # @param save [Boolean] whether failures are reported to the server.
      # @return [nil]
      def eval_finisher_script(save=false)
        update_finisher_starting_status

        outputs = []
        begin
          context = isolated_binding({
            outputs: outputs,
            job_id: job_id
          })
          eval_with_context filename, context
        rescue SyntaxError, StandardError => e
          # SyntaxError is a ScriptError, not a StandardError, so it has to
          # be listed explicitly to be reported as well.
          handle_error(e) if save
          raise e
        end

        puts "=========== Finisher Executed ==========="
        begin
          save_outputs(outputs)
        rescue => e
          handle_error(e) if save
          raise e
        end

        update_finisher_done_status
      end

      # @return [Symbol] always +:executing+.
      def save_type
        :executing
      end

      # Forwards +opts+ to +finisher_update+, mapping +:status+ to
      # +:finisher_status+.
      #
      # @param opts [Hash] reads +:job_id+, +:outputs+ and +:status+.
      # @return [Object] the result of +finisher_update+.
      def update_to_server(opts = {})
        finisher_update(
          job_id: opts[:job_id],
          outputs: opts[:outputs],
          finisher_status: opts[:status])
      end

      # Sends the +:starting+ finisher status; no-op unless +save+ is set.
      #
      # @raise [RuntimeError] when the response code is not 200.
      # @return [nil]
      def update_finisher_starting_status
        return unless save

        response = finisher_update(
          job_id: job_id,
          finisher_status: :starting)

        unless response.code == 200
          puts "Error: Unable to save Finisher Status to server: #{response.body}"
          raise "Unable to save Finisher Status to server: #{response.body}"
        end

        puts "Finisher Status Updated."
      end

      # Sends the +:done+ finisher status; no-op unless +save+ is set.
      #
      # @raise [RuntimeError] when the response code is not 200.
      # @return [nil]
      def update_finisher_done_status
        return unless save

        response = finisher_update(
          job_id: job_id,
          finisher_status: :done)

        unless response.code == 200
          puts "Error: Unable to save Finisher Done Status to server: #{response.body}"
          raise "Unable to save Finisher Done Status to server: #{response.body}"
        end

        puts "Finisher Done."
      end

      # Sends the +:failed+ finisher status with a log entry built from the
      # exception's class, message, job id and cleaned backtrace.
      #
      # @param e [Exception] the error to report.
      # @return [Object] the result of +finisher_update+.
      def handle_error(e)
        # The closing parenthesis after the job id was missing originally.
        error = ["Finisher #{e.class}: #{e.to_s} (Job:#{job_id})",clean_backtrace(e.backtrace)].join("\n")

        finisher_update(
          job_id: job_id,
          finisher_status: :failed,
          log_error: error)
      end
    end
  end
end
|
@@ -0,0 +1,200 @@
|
|
1
|
+
module Datahen
  module Scraper
    # Executes a Ruby parser script for one page and, when saving is
    # enabled, reports its status and failures through +parsing_update+.
    #
    # NOTE(review): +job_id+, +gid+, +filename+, +init_page+,
    # +isolated_binding+, +eval_with_context+, +save_pages_and_outputs+,
    # +parsing_update+, +get_content+, +get_failed_content+ and
    # +clean_backtrace+ are not defined in this class; presumably they are
    # inherited from Executor -- confirm there.
    class RubyParserExecutor < Executor
      # @return [Boolean] flag set by {#exec_parser}; status updates and the
      #   refetch/reparse requests are only sent when it is truthy.
      attr_accessor :save
      # Refetch self page flag.
      # @return [Boolean]
      # @note It is stronger than #reparse_self flag.
      attr_accessor :refetch_self
      # Reparse self page flag.
      # @return [Boolean]
      attr_accessor :reparse_self

      # @param options [Hash]
      # @option options [String] :filename script to run (required).
      # @option options [Object] :gid page identifier (required).
      # @option options [Object] :job_id job identifier; the key must be
      #   present (its value may be +nil+).
      # @option options [Hash] :vars page vars; defaults to an empty hash.
      # @raise [RuntimeError] when +:filename+ or +:gid+ is missing.
      # @raise [KeyError] when +:job_id+ is missing.
      def initialize(options={})
        @filename = options.fetch(:filename) { raise "Filename is required"}
        @gid = options.fetch(:gid) { raise "GID is required"}
        @job_id = options.fetch(:job_id)
        @page_vars = options.fetch(:vars) { {} }
      end

      # @return [Array<Symbol>] frozen list of method names.
      #   NOTE(review): presumably the methods made callable from inside the
      #   script -- confirm against Executor.
      def self.exposed_methods
        %i[
          content
          failed_content
          outputs
          pages
          page
          save_pages
          save_outputs
          find_output
          find_outputs
          refetch
          reparse
        ].freeze
      end

      # Records the +save+ flag, announces the run mode on stdout and
      # evaluates the script.
      #
      # @param save [Boolean] +true+ prints "Executing", +false+ "Trying".
      # @return [Object] the result of {#eval_parser_script}.
      def exec_parser(save=false)
        @save = save
        puts(save ? "Executing parser script" : "Trying parser script")

        eval_parser_script(save)
      end

      # Overrides +page['vars']+ with the vars given at construction, but
      # only when those vars are neither +nil+ nor empty.
      #
      # @param page [Hash] page record; mutated in place.
      # @return [Hash] the same +page+ object.
      def init_page_vars(page)
        page['vars'] = @page_vars unless @page_vars.nil? || @page_vars.empty?
        page
      end

      # Forwards +opts+ to +parsing_update+, mapping +:status+ to
      # +:parsing_status+.
      #
      # @param opts [Hash] reads +:job_id+, +:gid+, +:pages+, +:outputs+ and
      #   +:status+.
      # @return [Object] the result of +parsing_update+.
      def update_to_server(opts = {})
        parsing_update(
          job_id: opts[:job_id],
          gid: opts[:gid],
          pages: opts[:pages],
          outputs: opts[:outputs],
          parsing_status: opts[:status])
      end

      # Sends the +:starting+ parsing status; no-op unless +save+ is set.
      #
      # @raise [RuntimeError] when the response code is not 200.
      # @return [nil]
      def update_parsing_starting_status
        return unless save

        response = parsing_update(
          job_id: job_id,
          gid: gid,
          parsing_status: :starting)

        unless response.code == 200
          puts "Error: Unable to save Page Parsing Status to server: #{response.body}"
          raise "Unable to save Page Parsing Status to server: #{response.body}"
        end

        puts "Page Parsing Status Updated."
      end

      # Sends the +:done+ parsing status; no-op unless +save+ is set.
      #
      # @raise [RuntimeError] when the response code is not 200.
      # @return [nil]
      def update_parsing_done_status
        return unless save

        response = parsing_update(
          job_id: job_id,
          gid: gid,
          parsing_status: :done)

        unless response.code == 200
          puts "Error: Unable to save Page Parsing Done Status to server: #{response.body}"
          raise "Unable to save Page Parsing Done Status to server: #{response.body}"
        end

        puts "Page Parsing Done."
      end

      # @return [Symbol] always +:parsing+.
      def save_type
        :parsing
      end

      # Requests a refetch of page +gid+ within this job when +save+ is set;
      # otherwise only prints what would have been done.
      #
      # @param gid [Object] identifier of the page to refetch.
      # @return [nil]
      def refetch_page gid
        if save
          Client::ScraperJobPage.new({gid: gid}).refetch_by_job(self.job_id)
          puts "Refetch page #{gid}"
        else
          puts "Would have refetch page #{gid}"
        end
      end

      # Refetches a page by GID. When +page_gid+ is the page currently being
      # parsed, the request is deferred: only {#refetch_self} is raised and
      # {#eval_parser_script} acts on it after the script has run.
      #
      # @param page_gid [String]
      # @raise [ArgumentError] when +page_gid+ is not a String.
      # @return [nil]
      def refetch page_gid
        raise ArgumentError, "page_gid needs to be a String." unless page_gid.is_a?(String)

        if page_gid == gid
          self.refetch_self = true
          return
        end
        refetch_page page_gid
      end

      # Requests a reparse of page +gid+ within this job when +save+ is set;
      # otherwise only prints what would have been done.
      #
      # @param gid [Object] identifier of the page to reparse.
      # @return [nil]
      def reparse_page gid
        if save
          Client::ScraperJobPage.new({gid: gid}).reparse_by_job(self.job_id)
          puts "Reparse page #{gid}"
        else
          puts "Would have reparse page #{gid}"
        end
      end

      # Reparses a page by GID. When +page_gid+ is the page currently being
      # parsed, the request is deferred: only {#reparse_self} is raised and
      # {#eval_parser_script} acts on it after the script has run.
      #
      # @param page_gid [String]
      # @raise [ArgumentError] when +page_gid+ is not a String.
      # @return [nil]
      def reparse page_gid
        raise ArgumentError, "page_gid needs to be a String." unless page_gid.is_a?(String)

        if page_gid == gid
          self.reparse_self = true
          return
        end
        reparse_page page_gid
      end

      # Evaluates the script file against the current page, hands the
      # collected pages and outputs to +save_pages_and_outputs+, and then
      # either refetches the page, reparses it, or marks parsing as done.
      #
      # Any error raised while evaluating the script or saving results is
      # reported through {#handle_error} when the +save+ argument is truthy,
      # and is always re-raised.
      #
      # Note that the status updates and the refetch/reparse requests
      # consult the +save+ accessor (set by {#exec_parser}), not this
      # method's argument.
      #
      # @param save [Boolean] whether failures are reported to the server.
      # @return [nil]
      def eval_parser_script(save=false)
        update_parsing_starting_status

        page = init_page_vars(init_page)
        outputs = []
        pages = []
        # Reset the deferred flags so a previous run cannot leak into this one.
        self.refetch_self = false
        self.reparse_self = false

        begin
          context = isolated_binding({
            outputs: outputs,
            pages: pages,
            page: page
          })
          eval_with_context filename, context
        rescue SyntaxError, StandardError => e
          # SyntaxError is a ScriptError, not a StandardError, so it has to
          # be listed explicitly to be reported as well.
          handle_error(e) if save
          raise e
        end

        puts "=========== Parsing Executed ==========="
        begin
          save_pages_and_outputs(pages, outputs, :parsing)
        rescue => e
          handle_error(e) if save
          raise e
        end

        # Refetch wins over reparse; the done status is only sent when the
        # script asked for neither.
        if refetch_self
          refetch_page gid
        elsif reparse_self
          reparse_page gid
        else
          update_parsing_done_status
        end
      end

      # @return [Object] result of +get_content(gid)+, memoized while truthy.
      def content
        @content ||= get_content(gid)
      end

      # @return [Object] result of +get_failed_content(gid)+, memoized while
      #   truthy.
      def failed_content
        @failed_content ||= get_failed_content(gid)
      end

      # Sends the +:failed+ parsing status with a log entry built from the
      # exception's class, message, job id, GID and cleaned backtrace.
      #
      # @param e [Exception] the error to report.
      # @return [Object] the result of +parsing_update+.
      def handle_error(e)
        error = ["Parsing #{e.class}: #{e.to_s} (Job:#{job_id} GID:#{gid})",clean_backtrace(e.backtrace)].join("\n")

        parsing_update(
          job_id: job_id,
          gid: gid,
          parsing_status: :failed,
          log_error: error)
      end
    end
  end
end
|
@@ -0,0 +1,120 @@
|
|
1
|
+
module Datahen
  module Scraper
    # Executes a Ruby seeder script and, when saving is enabled, reports its
    # status and failures through +seeding_update+.
    #
    # NOTE(review): +job_id+, +filename+, +isolated_binding+,
    # +eval_with_context+, +save_pages_and_outputs+, +seeding_update+ and
    # +clean_backtrace+ are not defined in this class; presumably they are
    # inherited from Executor -- confirm there.
    class RubySeederExecutor < Executor
      # @return [Boolean] flag set by {#exec_seeder}; the status update
      #   methods do nothing unless it is truthy.
      attr_accessor :save

      # @param options [Hash]
      # @option options [String] :filename script to run (required).
      # @option options [Object] :job_id job identifier; +nil+ when omitted.
      # @raise [RuntimeError] when +:filename+ is missing.
      def initialize(options={})
        @filename = options.fetch(:filename) { raise "Filename is required"}
        @job_id = options[:job_id]
      end

      # @return [Array<Symbol>] frozen list of method names.
      #   NOTE(review): presumably the methods made callable from inside the
      #   script -- confirm against Executor.
      def self.exposed_methods
        %i[
          outputs
          pages
          save_pages
          save_outputs
          find_output
          find_outputs
        ].freeze
      end

      # Records the +save+ flag, announces the run mode on stdout and
      # evaluates the script.
      #
      # @param save [Boolean] +true+ prints "Executing", +false+ "Trying".
      # @return [Object] the result of {#eval_seeder_script}.
      def exec_seeder(save=false)
        @save = save
        puts(save ? "Executing seeder script" : "Trying seeder script")

        eval_seeder_script(save)
      end

      # Evaluates the script file, then hands the collected pages and
      # outputs to +save_pages_and_outputs+ and marks seeding as done.
      #
      # Any error raised while evaluating the script or saving results is
      # reported through {#handle_error} when the +save+ argument is truthy,
      # and is always re-raised.
      #
      # Note that the starting/done status updates consult the +save+
      # accessor (set by {#exec_seeder}), not this method's argument.
      #
      # @param save [Boolean] whether failures are reported to the server.
      # @return [nil]
      def eval_seeder_script(save=false)
        update_seeding_starting_status

        outputs = []
        pages = []
        begin
          context = isolated_binding({
            outputs: outputs,
            pages: pages
          })
          eval_with_context filename, context
        rescue SyntaxError, StandardError => e
          # SyntaxError is a ScriptError, not a StandardError, so it has to
          # be listed explicitly to be reported as well.
          handle_error(e) if save
          raise e
        end

        puts "=========== Seeding Executed ==========="
        begin
          save_pages_and_outputs(pages, outputs, :seeding)
        rescue => e
          handle_error(e) if save
          raise e
        end

        update_seeding_done_status
      end

      # @return [Symbol] always +:seeding+.
      def save_type
        :seeding
      end

      # Forwards +opts+ to +seeding_update+, mapping +:status+ to
      # +:seeding_status+.
      #
      # @param opts [Hash] reads +:job_id+, +:pages+, +:outputs+ and +:status+.
      # @return [Object] the result of +seeding_update+.
      def update_to_server(opts = {})
        seeding_update(
          job_id: opts[:job_id],
          pages: opts[:pages],
          outputs: opts[:outputs],
          seeding_status: opts[:status])
      end

      # Sends the +:starting+ seeding status; no-op unless +save+ is set.
      #
      # @raise [RuntimeError] when the response code is not 200.
      # @return [nil]
      def update_seeding_starting_status
        return unless save

        response = seeding_update(
          job_id: job_id,
          seeding_status: :starting)

        unless response.code == 200
          puts "Error: Unable to save Seeding Status to server: #{response.body}"
          raise "Unable to save Seeding Status to server: #{response.body}"
        end

        puts "Seeding Status Updated."
      end

      # Sends the +:done+ seeding status; no-op unless +save+ is set.
      #
      # @raise [RuntimeError] when the response code is not 200.
      # @return [nil]
      def update_seeding_done_status
        return unless save

        response = seeding_update(
          job_id: job_id,
          seeding_status: :done)

        unless response.code == 200
          puts "Error: Unable to save Seeding Done Status to server: #{response.body}"
          raise "Unable to save Seeding Done Status to server: #{response.body}"
        end

        puts "Seeding Done."
      end

      # Sends the +:failed+ seeding status with a log entry built from the
      # exception's class, message, job id and cleaned backtrace.
      #
      # @param e [Exception] the error to report.
      # @return [Object] the result of +seeding_update+.
      def handle_error(e)
        # The closing parenthesis after the job id was missing originally.
        error = ["Seeding #{e.class}: #{e.to_s} (Job:#{job_id})",clean_backtrace(e.backtrace)].join("\n")

        seeding_update(
          job_id: job_id,
          seeding_status: :failed,
          log_error: error)
      end
    end
  end
end
|