answersengine 0.2.33
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +12 -0
- data/.travis.yml +7 -0
- data/CODE_OF_CONDUCT.md +74 -0
- data/Gemfile +6 -0
- data/LICENSE.txt +21 -0
- data/README.md +30 -0
- data/Rakefile +22 -0
- data/answersengine.gemspec +45 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/examples/fetchtest/libraries/hello.rb +9 -0
- data/examples/fetchtest/libraries/hello_fail.rb +10 -0
- data/examples/fetchtest/parsers/failed.rb +2 -0
- data/examples/fetchtest/parsers/find_outputs.rb +18 -0
- data/examples/fetchtest/parsers/home.rb +50 -0
- data/examples/fetchtest/parsers/nested_fail.rb +3 -0
- data/examples/fetchtest/parsers/simple.rb +14 -0
- data/examples/fetchtest/seeders/csv_seeder.rb +12 -0
- data/examples/fetchtest/seeders/failed.rb +1 -0
- data/examples/fetchtest/seeders/list_of_urls.csv +5 -0
- data/examples/fetchtest/seeders/seed.rb +28 -0
- data/examples/fetchtest/seeders/test_reset_page.rb +4 -0
- data/exe/answersengine +3 -0
- data/lib/answersengine.rb +5 -0
- data/lib/answersengine/cli.rb +33 -0
- data/lib/answersengine/cli/global_page.rb +39 -0
- data/lib/answersengine/cli/job.rb +30 -0
- data/lib/answersengine/cli/job_output.rb +69 -0
- data/lib/answersengine/cli/parser.rb +64 -0
- data/lib/answersengine/cli/scraper.rb +172 -0
- data/lib/answersengine/cli/scraper_deployment.rb +24 -0
- data/lib/answersengine/cli/scraper_export.rb +51 -0
- data/lib/answersengine/cli/scraper_exporter.rb +40 -0
- data/lib/answersengine/cli/scraper_job.rb +71 -0
- data/lib/answersengine/cli/scraper_page.rb +200 -0
- data/lib/answersengine/cli/seeder.rb +40 -0
- data/lib/answersengine/client.rb +23 -0
- data/lib/answersengine/client/backblaze_content.rb +45 -0
- data/lib/answersengine/client/base.rb +50 -0
- data/lib/answersengine/client/export.rb +10 -0
- data/lib/answersengine/client/global_page.rb +18 -0
- data/lib/answersengine/client/job.rb +53 -0
- data/lib/answersengine/client/job_export.rb +10 -0
- data/lib/answersengine/client/job_log.rb +27 -0
- data/lib/answersengine/client/job_output.rb +19 -0
- data/lib/answersengine/client/job_page.rb +62 -0
- data/lib/answersengine/client/job_stat.rb +16 -0
- data/lib/answersengine/client/scraper.rb +54 -0
- data/lib/answersengine/client/scraper_deployment.rb +17 -0
- data/lib/answersengine/client/scraper_export.rb +22 -0
- data/lib/answersengine/client/scraper_exporter.rb +14 -0
- data/lib/answersengine/client/scraper_job.rb +49 -0
- data/lib/answersengine/client/scraper_job_output.rb +19 -0
- data/lib/answersengine/client/scraper_job_page.rb +55 -0
- data/lib/answersengine/plugin.rb +6 -0
- data/lib/answersengine/plugin/context_exposer.rb +55 -0
- data/lib/answersengine/scraper.rb +16 -0
- data/lib/answersengine/scraper/executor.rb +292 -0
- data/lib/answersengine/scraper/parser.rb +18 -0
- data/lib/answersengine/scraper/ruby_parser_executor.rb +141 -0
- data/lib/answersengine/scraper/ruby_seeder_executor.rb +114 -0
- data/lib/answersengine/scraper/seeder.rb +18 -0
- data/lib/answersengine/version.rb +3 -0
- metadata +255 -0
@@ -0,0 +1,51 @@
|
|
1
|
+
module AnswersEngine
  class CLI < Thor
    # Thor subcommand grouping CLI actions for a scraper's exports:
    # show a single export, list exports, and download an exported file.
    class ScraperExport < Thor
      package_name "scraper export"

      # Override Thor's default banner so help output reads
      # "answersengine scraper export <usage>".
      def self.banner(command, namespace = nil, subcommand = false)
        "#{basename} #{@package_name} #{command.usage}"
      end

      desc "show <export_id>", "Show a scraper's export"
      # Fetches one export record from the API and prints it.
      def show(export_id)
        client = Client::ScraperExport.new(options)
        puts client.find(export_id).to_s
      end

      desc "list", "Gets a list of exports"
      long_desc <<-LONGDESC
        List exports.
      LONGDESC
      option :scraper_name, :aliases => :s, type: :string, desc: 'Filter by a specific scraper_name'
      option :page, :aliases => :p, type: :numeric, desc: 'Get the next set of records by page.'
      option :per_page, :aliases => :P, type: :numeric, desc: 'Number of records per page. Max 500 per page.'
      # Lists exports: scoped to one scraper when --scraper-name is given,
      # otherwise across all scrapers (different API client per case).
      def list
        if options[:scraper_name]
          client = Client::ScraperExport.new(options)
          puts client.all(options[:scraper_name]).to_s
        else
          client = Client::Export.new(options)
          puts client.all.to_s
        end
      end

      desc "download <export_id>", "Download the exported file"
      # Prints the export's signed download URL and opens it with the `open`
      # command. NOTE(review): `open` is macOS-specific; on other platforms
      # the URL is still printed but the open call will fail quietly.
      def download(export_id)
        client = Client::ScraperExport.new(options)
        result = JSON.parse(client.download(export_id).to_s)

        if result['signed_url']
          puts "Download url: \"#{result['signed_url']}\""
          # Pass the URL as a separate argv element instead of interpolating
          # it into a backtick shell string, so shell metacharacters inside
          # the signed URL cannot be executed (command injection).
          system("open", result['signed_url'])
        else
          puts "Exported file does not exist"
        end
      end
    end
  end
end
|
@@ -0,0 +1,40 @@
|
|
1
|
+
module AnswersEngine
  class CLI < Thor
    # Thor subcommand for a scraper's exporters: inspect one, start an
    # export run, or list the exporters configured on a scraper.
    class ScraperExporter < Thor
      package_name "scraper exporter"

      # Render help as "answersengine scraper exporter <usage>" instead of
      # Thor's default banner.
      def self.banner(command, namespace = nil, subcommand = false)
        "#{basename} #{@package_name} #{command.usage}"
      end

      desc "show <scraper_name> <exporter_name>", "Show a scraper's exporter"
      # Fetches and prints a single exporter definition.
      def show(scraper_name, exporter_name)
        exporter_api = Client::ScraperExporter.new(options)
        puts exporter_api.find(scraper_name, exporter_name).to_s
      end

      desc "start <scraper_name> <exporter_name>", "Starts an export"
      option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
      # Kicks off an export. With --job the export is created against that
      # specific job; otherwise it targets the scraper's current job.
      # NOTE(review): the no-job branch uses Client::ScraperExport rather
      # than Client::ScraperExporter — looks intentional, worth confirming.
      def start(scraper_name, exporter_name)
        result =
          if options[:job]
            Client::JobExport.new(options).create(options[:job], exporter_name)
          else
            Client::ScraperExport.new(options).create(scraper_name, exporter_name)
          end
        puts result.to_s
      end

      desc "list <scraper_name>", "gets a list of exporters on a scraper"
      long_desc <<-LONGDESC
        List exporters on a scraper.
      LONGDESC
      option :page, :aliases => :p, type: :numeric, desc: 'Get the next set of records by page.'
      option :per_page, :aliases => :P, type: :numeric, desc: 'Number of records per page. Max 500 per page.'
      # Prints every exporter configured on the given scraper.
      def list(scraper_name)
        exporter_api = Client::ScraperExporter.new(options)
        puts exporter_api.all(scraper_name).to_s
      end
    end
  end
end
|
@@ -0,0 +1,71 @@
|
|
1
|
+
module AnswersEngine
  class CLI < Thor
    # Thor subcommand managing the lifecycle of a scraper's current job:
    # show, list, cancel, resume, pause and update.
    class ScraperJob < Thor
      package_name "scraper job"

      # Render help as "answersengine scraper job <usage>".
      def self.banner(command, namespace = nil, subcommand = false)
        "#{basename} #{@package_name} #{command.usage}"
      end

      desc "show <scraper_name>", "Show a scraper's current job"
      # Prints the scraper's current job record.
      def show(scraper_name)
        puts job_client.find(scraper_name).to_s
      end

      desc "list <scraper_name>", "gets a list of jobs on a scraper"
      long_desc <<-LONGDESC
        List jobs on a scraper.
      LONGDESC
      option :page, :aliases => :p, type: :numeric, desc: 'Get the next set of records by page.'
      option :per_page, :aliases => :P, type: :numeric, desc: 'Number of records per page. Max 500 per page.'
      # Prints the scraper's job history.
      def list(scraper_name)
        puts job_client.all(scraper_name).to_s
      end

      desc "cancel <scraper_name>", "cancels a scraper's current job"
      long_desc <<-LONGDESC
        Cancels a scraper's current job
      LONGDESC
      def cancel(scraper_name)
        puts job_client.cancel(scraper_name).to_s
      end

      desc "resume <scraper_name>", "resumes a scraper's current job"
      long_desc <<-LONGDESC
        Resumes a scraper's current job
      LONGDESC
      def resume(scraper_name)
        puts job_client.resume(scraper_name).to_s
      end

      desc "pause <scraper_name>", "pauses a scraper's current job"
      long_desc <<-LONGDESC
        pauses a scraper's current job
      LONGDESC
      def pause(scraper_name)
        puts job_client.pause(scraper_name).to_s
      end

      desc "update <scraper_name>", "updates a scraper's current job"
      long_desc <<-LONGDESC
        Updates a scraper's current job.
      LONGDESC
      option :workers, :aliases => :w, type: :numeric, desc: 'Set how many standard workers to use. Scraper job must be restarted(paused then resumed, or cancelled then resumed) for it to take effect. Default: 1. '
      option :browsers, type: :numeric, desc: 'Set how many browser workers to use. Scraper job must be restarted(paused then resumed, or cancelled then resumed) for it to take effect. Default: 0. '
      option :proxy_type, desc: 'Set the Proxy type. Default: standard'
      # Forwards the parsed CLI options to the job-update API endpoint.
      def update(scraper_name)
        puts job_client.update(scraper_name, options).to_s
      end

      private

      # Builds a fresh API client scoped to this invocation's options.
      # (Private methods are not registered as Thor commands.)
      def job_client
        Client::ScraperJob.new(options)
      end
    end
  end
end
|
@@ -0,0 +1,200 @@
|
|
1
|
+
module AnswersEngine
  class CLI < Thor
    # Thor subcommand for working with the pages of a scraper job:
    # list, enqueue (add), update, refetch, reset, show, and page logs.
    class ScraperPage < Thor

      package_name "scraper page"
      # Override Thor's default banner so help output reads
      # "answersengine scraper page <usage>".
      def self.banner(command, namespace = nil, subcommand = false)
        "#{basename} #{@package_name} #{command.usage}"
      end

      desc "list <scraper_name>", "List Pages on a scraper's current job"
      long_desc <<-LONGDESC
        List all pages in a scraper's current job.\x5
      LONGDESC
      option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
      option :page_type, :aliases => :t, type: :string, desc: 'Filter by page_type'
      option :page, :aliases => :p, type: :numeric, desc: 'Get the next set of records by page.'
      option :per_page, :aliases => :P, type: :numeric, desc: 'Number of records per page. Max 500 per page.'
      option :fetch_fail, type: :boolean, desc: 'Returns only pages that fails fetching.'
      option :parse_fail, type: :boolean, desc: 'Returns only pages that fails parsing.'
      # Lists pages for an explicit --job ID, or for the scraper's current
      # job when no --job is given (different API client per case).
      def list(scraper_name)
        if options[:job]
          client = Client::JobPage.new(options)
          puts "#{client.all(options[:job])}"
        else
          client = Client::ScraperJobPage.new(options)
          puts "#{client.all(scraper_name)}"
        end
      end

      desc "add <scraper_name> <url>", "Enqueues a page to a scraper's current job"
      long_desc <<-LONGDESC
        Enqueues a page to a scraper's current job\x5
      LONGDESC
      option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
      option :method, :aliases => :m, type: :string, desc: 'Set request method. Default: GET'
      option :headers, :aliases => :H, type: :string, banner: :JSON, desc: 'Set request headers. Must be in json format. i.e: {"Foo":"bar"} '
      option :cookie, :aliases => :c, type: :string, desc: 'Set request cookie.'
      option :vars, :aliases => :v, type: :string, banner: :JSON, desc: 'Set user-defined page variables. Must be in json format. i.e: {"Foo":"bar"}'
      option :page_type, :aliases => :t, desc: 'Set page type'
      option :priority, type: :numeric, desc: 'Set fetch priority. The higher the value, the sooner the page gets fetched. Default: 0'
      option :fetch_type, :aliases => :F, desc: 'Set fetch type. Default: http'
      option :body, :aliases => :b, desc: 'Set request body'
      option :force_fetch, :aliases => :f, type: :boolean, desc: 'Set true to force fetch page that is not within freshness criteria. Default: false'
      option :freshness, :aliases => :s, desc: 'Set how fresh the page cache is. Accepts timestap format.'
      option :ua_type, :aliases => :u, desc: 'Set user agent type. Default: desktop'
      option :no_redirect, :aliases => :n, type: :boolean, desc: 'Set true to not follow redirect. Default: false'
      # Enqueues a page onto a job. --headers and --vars arrive as JSON
      # strings and are parsed in place (mutating the options hash) before
      # the whole options hash is forwarded to the API.
      def add(scraper_name, url)
        begin
          options[:headers] = JSON.parse(options[:headers]) if options[:headers]
          options[:vars] = JSON.parse(options[:vars]) if options[:vars]
          method = options[:method]

          if options[:job]
            client = Client::JobPage.new(options)
            puts "#{client.enqueue(options[:job], method, url, options)}"
          else
            client = Client::ScraperJobPage.new(options)
            puts "#{client.enqueue(scraper_name, method, url, options)}"
          end

        rescue JSON::ParserError
          # Either --headers or --vars failed to parse; report whichever was
          # supplied. (If both were supplied but only one is malformed, both
          # messages are still printed.)
          if options[:headers]
            puts "Error: #{options[:headers]} on headers is not a valid JSON"
          end
          if options[:vars]
            puts "Error: #{options[:vars]} on vars is not a valid JSON"
          end
        end
      end


      desc "update <scraper_name> <gid>", "Update a page in a scraper's current job"
      long_desc <<-LONGDESC
        Updates a page in a scraper's current job. Only page_type or page vars is updateable.\x5
      LONGDESC
      option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
      option :page_type, :aliases => :t, desc: 'Set page type'
      option :priority, type: :numeric, desc: 'Set fetch priority. The higher the value, the sooner the page gets fetched. Default: 0'
      option :vars, :aliases => :v, type: :string, desc: 'Set user-defined page variables. Must be in json format. i.e: {"Foo":"bar"}'
      # Updates a page identified by its GID, against --job or the
      # scraper's current job.
      def update(scraper_name, gid)
        begin
          # --vars arrives as a JSON string; parse in place before forwarding.
          options[:vars] = JSON.parse(options[:vars]) if options[:vars]

          if options[:job]
            client = Client::JobPage.new(options)
            puts "#{client.update(options[:job], gid, options)}"
          else
            client = Client::ScraperJobPage.new(options)
            puts "#{client.update(scraper_name, gid, options)}"
          end

        rescue JSON::ParserError
          if options[:vars]
            puts "Error: #{options[:vars]} on vars is not a valid JSON"
          end
        end
      end

      desc "refetch <scraper_name>", "Refetch Pages on a scraper's current job"
      long_desc <<-LONGDESC
        Refetch pages in a scraper's current job. You need to specify either a --gid or --fetch-fail or --parse-fail.\x5
      LONGDESC
      option :gid, :aliases => :g, type: :string, desc: 'Refetch a specific GID'
      option :fetch_fail, type: :boolean, desc: 'Refetches only pages that fails fetching.'
      option :parse_fail, type: :boolean, desc: 'Refetches only pages that fails parsing.'
      # Requires at least one of the three filters; the selection itself is
      # carried to the API inside the options hash.
      def refetch(scraper_name)
        if !options.key?(:gid) && !options.key?(:fetch_fail) && !options.key?(:parse_fail)
          puts "Must specify either a --gid or --fetch-fail or --parse-fail"
        else
          client = Client::ScraperJobPage.new(options)
          puts "#{client.refetch(scraper_name)}"
        end
      end

      desc "reset <scraper_name> <gid>", "Reset fetching and parsing of a page in a scraper's current job"
      long_desc <<-LONGDESC
        Reset fetching and parsing of a page in a scraper's current job.\x5
      LONGDESC
      option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
      # Resets fetch/parse state of a page, against --job or the scraper's
      # current job.
      def reset(scraper_name, gid)
        begin
          # NOTE(review): no :vars option is declared on this command, so
          # options[:vars] is always nil and this parse (plus the rescue
          # below) appears to be dead code copied from `update` — confirm.
          options[:vars] = JSON.parse(options[:vars]) if options[:vars]

          if options[:job]
            client = Client::JobPage.new(options)
            puts "#{client.reset(options[:job], gid, options)}"
          else
            client = Client::ScraperJobPage.new(options)
            puts "#{client.reset(scraper_name, gid, options)}"
          end

        rescue JSON::ParserError
          if options[:vars]
            puts "Error: #{options[:vars]} on vars is not a valid JSON"
          end
        end
      end

      desc "show <scraper_name> <gid>", "Show a page in scraper's current job"
      long_desc <<-LONGDESC
        Shows a page in a scraper's current job.\x5
      LONGDESC
      option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
      # Prints one page record, against --job or the scraper's current job.
      def show(scraper_name, gid)
        if options[:job]
          client = Client::JobPage.new(options)
          puts "#{client.find(options[:job], gid)}"
        else
          client = Client::ScraperJobPage.new(options)
          puts "#{client.find(scraper_name, gid)}"
        end
      end

      desc "log <scraper_name> <gid>", "List log entries related to a job page"
      long_desc <<-LONGDESC
        Shows log related to a page in the job. Defaults to showing the most recent entries\x5
      LONGDESC
      option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
      option :head, :aliases => :H, desc: 'Show the oldest log entries. If not set, newest entries is shown'
      option :parsing, :aliases => :p, type: :boolean, desc: 'Show only log entries related to parsing'
      option :more, :aliases => :m, desc: 'Show next set of log entries. Enter the `More token`'
      option :per_page, :aliases => :P, type: :numeric, desc: 'Number of records per page. Max 5000 per page.'
      # Streams log entries for one page of the job and prints a
      # continuation token when more entries are available.
      def log(scraper_name, gid)
        client = Client::JobLog.new(options)

        # Build the API query from the CLI options. NOTE(review): `delete`
        # mutates the Thor options hash in place (so the consumed keys are
        # not forwarded again) — presumably intentional, but it would raise
        # if the options hash were frozen; confirm against the Thor version.
        query = {}
        query["order"] = options.delete(:head) if options[:head]
        query["job_type"] = "parsing" if options[:parsing]

        query["page_token"] = options.delete(:more) if options[:more]
        query["per_page"] = options.delete(:per_page) if options[:per_page]

        puts "Fetching page logs..."

        if options[:job]
          result = client.all_job_page_log(options[:job], gid, {query: query})
        else
          result = client.scraper_all_job_page_log(scraper_name, gid, {query: query})
        end

        if result['entries'].nil? || result["entries"].length == 0
          puts "No logs yet, please try again later."
        else

          more_token = result["more_token"]

          # Non-Hash entries are silently skipped.
          result["entries"].each do |entry|
            puts "#{entry["timestamp"]} #{entry["severity"]}: #{entry["payload"]}" if entry.is_a?(Hash)
          end

          unless more_token.nil?
            puts "to see more entries, add: \"--more #{more_token}\""
          end
        end
      end

    end
  end
end
|
@@ -0,0 +1,40 @@
|
|
1
|
+
module AnswersEngine
  class CLI < Thor
    # Thor subcommand for running seeder scripts against a scraper job,
    # either as a dry run ("try") or for real ("exec").
    # NOTE(review): the help text advertises the commands as "try"/"exec",
    # but the registered Thor command names are the method names
    # (`try_seed`/`exec_parse`) — confirm which naming is intended.
    class Seeder < Thor
      desc "try <scraper_name> <seeder_file>", "Tries a seeder file"
      long_desc <<-LONGDESC
        Takes a seeder script and tries to execute it without saving anything.\x5
        <seeder_file>: Seeder script file will be executed.\x5
      LONGDESC
      option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
      # Dry run: executes the seeder script without enqueuing anything
      # (save flag = false).
      def try_seed(scraper_name, seeder_file)
        puts AnswersEngine::Scraper::Seeder.exec_seeder(seeder_file, target_job_id(scraper_name), false)
      end

      desc "exec <scraper_name> <seeder_file>", "Executes a seeder script onto a scraper's current job."
      long_desc <<-LONGDESC
        Takes a seeder script and execute it against a job and enqueues the pages into the scraper's current job\x5
        <seeder_file>: Seeder script file that will be executed on the scraper's current job.\x5
      LONGDESC
      option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
      # Live run: executes the seeder script and enqueues the resulting
      # pages (save flag = true).
      def exec_parse(scraper_name, seeder_file)
        puts AnswersEngine::Scraper::Seeder.exec_seeder(seeder_file, target_job_id(scraper_name), true)
      end

      private

      # Resolves which job to run against: an explicit --job ID wins,
      # otherwise the scraper's current job is looked up via the API.
      # (Private methods are not registered as Thor commands.)
      def target_job_id(scraper_name)
        return options[:job] if options[:job]

        Client::ScraperJob.new(options).find(scraper_name)['id']
      end
    end
  end
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
# Loads every API client class and declares the AnswersEngine::Client
# namespace. Quoting unified to double quotes (the file previously mixed
# single- and double-quoted requires); original load order preserved —
# "base" stays first since it is presumably the superclass the other
# clients depend on (TODO: confirm against the client class definitions).
require "answersengine/client/base"
require "answersengine/client/export"
require "answersengine/client/scraper"
require "answersengine/client/scraper_deployment"
require "answersengine/client/scraper_job_output"
require "answersengine/client/scraper_job_page"
require "answersengine/client/scraper_exporter"
require "answersengine/client/scraper_export"
require "answersengine/client/scraper_job"
require "answersengine/client/job_export"
require "answersengine/client/job"
require "answersengine/client/job_log"
require "answersengine/client/global_page"
require "answersengine/client/job_page"
require "answersengine/client/job_output"
require "answersengine/client/job_stat"
require "answersengine/client/backblaze_content"


# Namespace module for the HTTP API client layer.
module AnswersEngine
  module Client
  end
end
|
@@ -0,0 +1,45 @@
|
|
1
|
+
require 'zlib'
require 'httparty'

module AnswersEngine
  module Client
    # Fetches raw (optionally gzip-compressed) content over HTTP and
    # decompresses it, tolerating truncated gzip streams.
    class BackblazeContent
      include HTTParty

      # Performs a plain GET and returns the response body untouched.
      def get_content(url)
        self.class.get(url, format: :plain)
      end

      # Downloads the URL and gunzips the response body.
      def get_gunzipped_content(url)
        # Zlib.gunzip(get_content(url))
        gunzip(get_content(url))
      end

      # Decompresses a gzip byte string. If the stream is truncated
      # ("unexpected end of file"), salvages whatever decompresses cleanly
      # by reading one character at a time until EOF, ignoring any further
      # Zlib error. Any other error is re-raised.
      # NOTE(review): relies on StringIO without requiring 'stringio'
      # here — presumably loaded transitively; confirm.
      def gunzip(data)
        buffer = StringIO.new(data)
        reader = Zlib::GzipReader.new(buffer, encoding: Encoding::ASCII_8BIT)
        content = ""
        begin
          content = reader.read
        rescue => error
          # Only the truncated-stream case is tolerated; matching is done
          # on the exception message.
          raise error unless error.to_s == 'unexpected end of file'
          begin
            content += reader.readchar until reader.eof?
          rescue => error
            puts "Ignored Zlib error: #{error.to_s}"
          end
        end

        return content
      ensure
        # reader may be nil if GzipReader.new itself raised.
        reader.close if reader.respond_to?(:close)
      end
    end
  end
end
|