answersengine 0.10.1 → 0.10.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CODE_OF_CONDUCT.md +1 -1
- data/LICENSE.txt +1 -1
- data/README.md +3 -4
- data/answersengine.gemspec +6 -12
- data/exe/answersengine +3 -2
- data/lib/answersengine.rb +20 -3
- metadata +14 -152
- data/examples/fetchtest/libraries/hello.rb +0 -9
- data/examples/fetchtest/libraries/hello_fail.rb +0 -10
- data/examples/fetchtest/parsers/failed.rb +0 -2
- data/examples/fetchtest/parsers/find_outputs.rb +0 -18
- data/examples/fetchtest/parsers/home.rb +0 -50
- data/examples/fetchtest/parsers/nested_fail.rb +0 -3
- data/examples/fetchtest/parsers/simple.rb +0 -14
- data/examples/fetchtest/seeders/csv_seeder.rb +0 -12
- data/examples/fetchtest/seeders/failed.rb +0 -1
- data/examples/fetchtest/seeders/list_of_urls.csv +0 -5
- data/examples/fetchtest/seeders/seed.rb +0 -28
- data/examples/fetchtest/seeders/test_reset_page.rb +0 -4
- data/lib/answersengine/cli.rb +0 -45
- data/lib/answersengine/cli/env_var.rb +0 -48
- data/lib/answersengine/cli/finisher.rb +0 -40
- data/lib/answersengine/cli/global_page.rb +0 -39
- data/lib/answersengine/cli/job.rb +0 -30
- data/lib/answersengine/cli/job_output.rb +0 -69
- data/lib/answersengine/cli/parser.rb +0 -64
- data/lib/answersengine/cli/scraper.rb +0 -185
- data/lib/answersengine/cli/scraper_deployment.rb +0 -24
- data/lib/answersengine/cli/scraper_export.rb +0 -51
- data/lib/answersengine/cli/scraper_exporter.rb +0 -40
- data/lib/answersengine/cli/scraper_finisher.rb +0 -20
- data/lib/answersengine/cli/scraper_job.rb +0 -75
- data/lib/answersengine/cli/scraper_job_var.rb +0 -48
- data/lib/answersengine/cli/scraper_page.rb +0 -203
- data/lib/answersengine/cli/scraper_var.rb +0 -48
- data/lib/answersengine/cli/seeder.rb +0 -40
- data/lib/answersengine/client.rb +0 -29
- data/lib/answersengine/client/auth_token.rb +0 -50
- data/lib/answersengine/client/backblaze_content.rb +0 -45
- data/lib/answersengine/client/base.rb +0 -55
- data/lib/answersengine/client/deploy_key.rb +0 -21
- data/lib/answersengine/client/env_var.rb +0 -28
- data/lib/answersengine/client/export.rb +0 -10
- data/lib/answersengine/client/global_page.rb +0 -18
- data/lib/answersengine/client/job.rb +0 -64
- data/lib/answersengine/client/job_export.rb +0 -10
- data/lib/answersengine/client/job_log.rb +0 -26
- data/lib/answersengine/client/job_output.rb +0 -19
- data/lib/answersengine/client/job_page.rb +0 -58
- data/lib/answersengine/client/job_stat.rb +0 -16
- data/lib/answersengine/client/scraper.rb +0 -57
- data/lib/answersengine/client/scraper_deployment.rb +0 -18
- data/lib/answersengine/client/scraper_export.rb +0 -22
- data/lib/answersengine/client/scraper_exporter.rb +0 -14
- data/lib/answersengine/client/scraper_finisher.rb +0 -16
- data/lib/answersengine/client/scraper_job.rb +0 -49
- data/lib/answersengine/client/scraper_job_output.rb +0 -19
- data/lib/answersengine/client/scraper_job_page.rb +0 -67
- data/lib/answersengine/client/scraper_job_var.rb +0 -28
- data/lib/answersengine/client/scraper_var.rb +0 -28
- data/lib/answersengine/plugin.rb +0 -6
- data/lib/answersengine/plugin/context_exposer.rb +0 -55
- data/lib/answersengine/scraper.rb +0 -18
- data/lib/answersengine/scraper/executor.rb +0 -373
- data/lib/answersengine/scraper/finisher.rb +0 -18
- data/lib/answersengine/scraper/parser.rb +0 -18
- data/lib/answersengine/scraper/ruby_finisher_executor.rb +0 -116
- data/lib/answersengine/scraper/ruby_parser_executor.rb +0 -200
- data/lib/answersengine/scraper/ruby_seeder_executor.rb +0 -120
- data/lib/answersengine/scraper/seeder.rb +0 -18
- data/lib/answersengine/version.rb +0 -3
@@ -1,200 +0,0 @@
|
|
1
|
-
module AnswersEngine
|
2
|
-
module Scraper
|
3
|
-
class RubyParserExecutor < Executor
|
4
|
-
attr_accessor :save
|
5
|
-
# Refetch self page flag.
|
6
|
-
# @return [Boolean]
|
7
|
-
# @note It is stronger than #reparse_self flag.
|
8
|
-
attr_accessor :refetch_self
|
9
|
-
# Reparse self page flag.
|
10
|
-
# @return [Boolean]
|
11
|
-
attr_accessor :reparse_self
|
12
|
-
|
13
|
-
def initialize(options={})
|
14
|
-
@filename = options.fetch(:filename) { raise "Filename is required"}
|
15
|
-
@gid = options.fetch(:gid) { raise "GID is required"}
|
16
|
-
@job_id = options.fetch(:job_id)
|
17
|
-
@page_vars = options.fetch(:vars) { {} }
|
18
|
-
end
|
19
|
-
|
20
|
-
def self.exposed_methods
|
21
|
-
[
|
22
|
-
:content,
|
23
|
-
:failed_content,
|
24
|
-
:outputs,
|
25
|
-
:pages,
|
26
|
-
:page,
|
27
|
-
:save_pages,
|
28
|
-
:save_outputs,
|
29
|
-
:find_output,
|
30
|
-
:find_outputs,
|
31
|
-
:refetch,
|
32
|
-
:reparse
|
33
|
-
].freeze
|
34
|
-
end
|
35
|
-
|
36
|
-
def exec_parser(save=false)
|
37
|
-
@save = save
|
38
|
-
if save
|
39
|
-
puts "Executing parser script"
|
40
|
-
else
|
41
|
-
puts "Trying parser script"
|
42
|
-
end
|
43
|
-
|
44
|
-
eval_parser_script(save)
|
45
|
-
end
|
46
|
-
|
47
|
-
def init_page_vars(page)
|
48
|
-
if !@page_vars.nil? && !@page_vars.empty?
|
49
|
-
page['vars'] = @page_vars
|
50
|
-
end
|
51
|
-
page
|
52
|
-
end
|
53
|
-
|
54
|
-
def update_to_server(opts = {})
|
55
|
-
parsing_update(
|
56
|
-
job_id: opts[:job_id],
|
57
|
-
gid: opts[:gid],
|
58
|
-
pages: opts[:pages],
|
59
|
-
outputs: opts[:outputs],
|
60
|
-
parsing_status: opts[:status])
|
61
|
-
end
|
62
|
-
|
63
|
-
def update_parsing_starting_status
|
64
|
-
return unless save
|
65
|
-
|
66
|
-
response = parsing_update(
|
67
|
-
job_id: job_id,
|
68
|
-
gid: gid,
|
69
|
-
parsing_status: :starting)
|
70
|
-
|
71
|
-
if response.code == 200
|
72
|
-
puts "Page Parsing Status Updated."
|
73
|
-
else
|
74
|
-
puts "Error: Unable to save Page Parsing Status to server: #{response.body}"
|
75
|
-
raise "Unable to save Page Parsing Status to server: #{response.body}"
|
76
|
-
end
|
77
|
-
end
|
78
|
-
|
79
|
-
def update_parsing_done_status
|
80
|
-
return unless save
|
81
|
-
|
82
|
-
response = parsing_update(
|
83
|
-
job_id: job_id,
|
84
|
-
gid: gid,
|
85
|
-
parsing_status: :done)
|
86
|
-
|
87
|
-
if response.code == 200
|
88
|
-
puts "Page Parsing Done."
|
89
|
-
else
|
90
|
-
puts "Error: Unable to save Page Parsing Done Status to server: #{response.body}"
|
91
|
-
raise "Unable to save Page Parsing Done Status to server: #{response.body}"
|
92
|
-
end
|
93
|
-
end
|
94
|
-
|
95
|
-
def save_type
|
96
|
-
:parsing
|
97
|
-
end
|
98
|
-
|
99
|
-
def refetch_page gid
|
100
|
-
if save
|
101
|
-
Client::ScraperJobPage.new({gid: gid}).refetch_by_job(self.job_id)
|
102
|
-
puts "Refetch page #{gid}"
|
103
|
-
else
|
104
|
-
puts "Would have refetch page #{gid}"
|
105
|
-
end
|
106
|
-
end
|
107
|
-
|
108
|
-
def refetch page_gid
|
109
|
-
raise ArgumentError.new("page_gid needs to be a String.") unless page_gid.is_a?(String)
|
110
|
-
if page_gid == gid
|
111
|
-
self.refetch_self = true
|
112
|
-
return
|
113
|
-
end
|
114
|
-
refetch_page page_gid
|
115
|
-
end
|
116
|
-
|
117
|
-
def reparse_page gid
|
118
|
-
if save
|
119
|
-
Client::ScraperJobPage.new({gid: gid}).reparse_by_job(self.job_id)
|
120
|
-
puts "Reparse page #{gid}"
|
121
|
-
else
|
122
|
-
puts "Would have reparse page #{gid}"
|
123
|
-
end
|
124
|
-
end
|
125
|
-
|
126
|
-
def reparse page_gid
|
127
|
-
raise ArgumentError.new("page_gid needs to be a String.") unless page_gid.is_a?(String)
|
128
|
-
if page_gid == gid
|
129
|
-
self.reparse_self = true
|
130
|
-
return
|
131
|
-
end
|
132
|
-
reparse_page page_gid
|
133
|
-
end
|
134
|
-
|
135
|
-
def eval_parser_script(save=false)
|
136
|
-
update_parsing_starting_status
|
137
|
-
|
138
|
-
proc = Proc.new do
|
139
|
-
page = init_page
|
140
|
-
outputs = []
|
141
|
-
pages = []
|
142
|
-
page = init_page_vars(page)
|
143
|
-
self.refetch_self = false
|
144
|
-
self.reparse_self = false
|
145
|
-
|
146
|
-
begin
|
147
|
-
context = isolated_binding({
|
148
|
-
outputs: outputs,
|
149
|
-
pages: pages,
|
150
|
-
page: page
|
151
|
-
})
|
152
|
-
eval_with_context filename, context
|
153
|
-
rescue SyntaxError => e
|
154
|
-
handle_error(e) if save
|
155
|
-
raise e
|
156
|
-
rescue => e
|
157
|
-
handle_error(e) if save
|
158
|
-
raise e
|
159
|
-
end
|
160
|
-
|
161
|
-
puts "=========== Parsing Executed ==========="
|
162
|
-
begin
|
163
|
-
save_pages_and_outputs(pages, outputs, :parsing)
|
164
|
-
rescue => e
|
165
|
-
handle_error(e) if save
|
166
|
-
raise e
|
167
|
-
end
|
168
|
-
|
169
|
-
if refetch_self
|
170
|
-
refetch_page gid
|
171
|
-
elsif reparse_self
|
172
|
-
reparse_page gid
|
173
|
-
else
|
174
|
-
update_parsing_done_status
|
175
|
-
end
|
176
|
-
end
|
177
|
-
proc.call
|
178
|
-
end
|
179
|
-
|
180
|
-
def content
|
181
|
-
@content ||= get_content(gid)
|
182
|
-
end
|
183
|
-
|
184
|
-
def failed_content
|
185
|
-
@failed_content ||= get_failed_content(gid)
|
186
|
-
end
|
187
|
-
|
188
|
-
def handle_error(e)
|
189
|
-
error = ["Parsing #{e.class}: #{e.to_s} (Job:#{job_id} GID:#{gid})",clean_backtrace(e.backtrace)].join("\n")
|
190
|
-
|
191
|
-
parsing_update(
|
192
|
-
job_id: job_id,
|
193
|
-
gid: gid,
|
194
|
-
parsing_status: :failed,
|
195
|
-
log_error: error)
|
196
|
-
end
|
197
|
-
|
198
|
-
end
|
199
|
-
end
|
200
|
-
end
|
@@ -1,120 +0,0 @@
|
|
1
|
-
module AnswersEngine
|
2
|
-
module Scraper
|
3
|
-
class RubySeederExecutor < Executor
|
4
|
-
attr_accessor :save
|
5
|
-
|
6
|
-
def initialize(options={})
|
7
|
-
@filename = options.fetch(:filename) { raise "Filename is required"}
|
8
|
-
@job_id = options[:job_id]
|
9
|
-
end
|
10
|
-
|
11
|
-
def self.exposed_methods
|
12
|
-
[
|
13
|
-
:outputs,
|
14
|
-
:pages,
|
15
|
-
:save_pages,
|
16
|
-
:save_outputs,
|
17
|
-
:find_output,
|
18
|
-
:find_outputs
|
19
|
-
].freeze
|
20
|
-
end
|
21
|
-
|
22
|
-
def exec_seeder(save=false)
|
23
|
-
@save = save
|
24
|
-
if save
|
25
|
-
puts "Executing seeder script"
|
26
|
-
else
|
27
|
-
puts "Trying seeder script"
|
28
|
-
end
|
29
|
-
|
30
|
-
eval_seeder_script(save)
|
31
|
-
end
|
32
|
-
|
33
|
-
def eval_seeder_script(save=false)
|
34
|
-
update_seeding_starting_status
|
35
|
-
|
36
|
-
proc = Proc.new do
|
37
|
-
outputs = []
|
38
|
-
pages = []
|
39
|
-
|
40
|
-
begin
|
41
|
-
context = isolated_binding({
|
42
|
-
outputs: outputs,
|
43
|
-
pages: pages
|
44
|
-
})
|
45
|
-
eval_with_context filename, context
|
46
|
-
rescue SyntaxError => e
|
47
|
-
handle_error(e) if save
|
48
|
-
raise e
|
49
|
-
rescue => e
|
50
|
-
handle_error(e) if save
|
51
|
-
raise e
|
52
|
-
end
|
53
|
-
|
54
|
-
puts "=========== Seeding Executed ==========="
|
55
|
-
begin
|
56
|
-
save_pages_and_outputs(pages, outputs, :seeding)
|
57
|
-
rescue => e
|
58
|
-
handle_error(e) if save
|
59
|
-
raise e
|
60
|
-
end
|
61
|
-
|
62
|
-
update_seeding_done_status
|
63
|
-
end
|
64
|
-
proc.call
|
65
|
-
end
|
66
|
-
|
67
|
-
def save_type
|
68
|
-
:seeding
|
69
|
-
end
|
70
|
-
|
71
|
-
def update_to_server(opts = {})
|
72
|
-
seeding_update(
|
73
|
-
job_id: opts[:job_id],
|
74
|
-
pages: opts[:pages],
|
75
|
-
outputs: opts[:outputs],
|
76
|
-
seeding_status: opts[:status])
|
77
|
-
end
|
78
|
-
|
79
|
-
def update_seeding_starting_status
|
80
|
-
return unless save
|
81
|
-
|
82
|
-
response = seeding_update(
|
83
|
-
job_id: job_id,
|
84
|
-
seeding_status: :starting)
|
85
|
-
|
86
|
-
if response.code == 200
|
87
|
-
puts "Seeding Status Updated."
|
88
|
-
else
|
89
|
-
puts "Error: Unable to save Seeding Status to server: #{response.body}"
|
90
|
-
raise "Unable to save Seeding Status to server: #{response.body}"
|
91
|
-
end
|
92
|
-
end
|
93
|
-
|
94
|
-
def update_seeding_done_status
|
95
|
-
return unless save
|
96
|
-
|
97
|
-
response = seeding_update(
|
98
|
-
job_id: job_id,
|
99
|
-
seeding_status: :done)
|
100
|
-
|
101
|
-
if response.code == 200
|
102
|
-
puts "Seeding Done."
|
103
|
-
else
|
104
|
-
puts "Error: Unable to save Seeding Done Status to server: #{response.body}"
|
105
|
-
raise "Unable to save Seeding Done Status to server: #{response.body}"
|
106
|
-
end
|
107
|
-
end
|
108
|
-
|
109
|
-
def handle_error(e)
|
110
|
-
error = ["Seeding #{e.class}: #{e.to_s} (Job:#{job_id}",clean_backtrace(e.backtrace)].join("\n")
|
111
|
-
|
112
|
-
seeding_update(
|
113
|
-
job_id: job_id,
|
114
|
-
seeding_status: :failed,
|
115
|
-
log_error: error)
|
116
|
-
end
|
117
|
-
|
118
|
-
end
|
119
|
-
end
|
120
|
-
end
|
@@ -1,18 +0,0 @@
|
|
1
|
-
module AnswersEngine
|
2
|
-
module Scraper
|
3
|
-
class Seeder
|
4
|
-
|
5
|
-
def self.exec_seeder(filename, job_id=nil, save=false)
|
6
|
-
extname = File.extname(filename)
|
7
|
-
case extname
|
8
|
-
when '.rb'
|
9
|
-
executor = RubySeederExecutor.new(filename: filename, job_id: job_id)
|
10
|
-
executor.exec_seeder(save)
|
11
|
-
else
|
12
|
-
puts "Unable to find a seeder executor for file type \"#{extname}\""
|
13
|
-
end
|
14
|
-
end
|
15
|
-
|
16
|
-
end
|
17
|
-
end
|
18
|
-
end
|