answersengine 0.2.33
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +12 -0
- data/.travis.yml +7 -0
- data/CODE_OF_CONDUCT.md +74 -0
- data/Gemfile +6 -0
- data/LICENSE.txt +21 -0
- data/README.md +30 -0
- data/Rakefile +22 -0
- data/answersengine.gemspec +45 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/examples/fetchtest/libraries/hello.rb +9 -0
- data/examples/fetchtest/libraries/hello_fail.rb +10 -0
- data/examples/fetchtest/parsers/failed.rb +2 -0
- data/examples/fetchtest/parsers/find_outputs.rb +18 -0
- data/examples/fetchtest/parsers/home.rb +50 -0
- data/examples/fetchtest/parsers/nested_fail.rb +3 -0
- data/examples/fetchtest/parsers/simple.rb +14 -0
- data/examples/fetchtest/seeders/csv_seeder.rb +12 -0
- data/examples/fetchtest/seeders/failed.rb +1 -0
- data/examples/fetchtest/seeders/list_of_urls.csv +5 -0
- data/examples/fetchtest/seeders/seed.rb +28 -0
- data/examples/fetchtest/seeders/test_reset_page.rb +4 -0
- data/exe/answersengine +3 -0
- data/lib/answersengine.rb +5 -0
- data/lib/answersengine/cli.rb +33 -0
- data/lib/answersengine/cli/global_page.rb +39 -0
- data/lib/answersengine/cli/job.rb +30 -0
- data/lib/answersengine/cli/job_output.rb +69 -0
- data/lib/answersengine/cli/parser.rb +64 -0
- data/lib/answersengine/cli/scraper.rb +172 -0
- data/lib/answersengine/cli/scraper_deployment.rb +24 -0
- data/lib/answersengine/cli/scraper_export.rb +51 -0
- data/lib/answersengine/cli/scraper_exporter.rb +40 -0
- data/lib/answersengine/cli/scraper_job.rb +71 -0
- data/lib/answersengine/cli/scraper_page.rb +200 -0
- data/lib/answersengine/cli/seeder.rb +40 -0
- data/lib/answersengine/client.rb +23 -0
- data/lib/answersengine/client/backblaze_content.rb +45 -0
- data/lib/answersengine/client/base.rb +50 -0
- data/lib/answersengine/client/export.rb +10 -0
- data/lib/answersengine/client/global_page.rb +18 -0
- data/lib/answersengine/client/job.rb +53 -0
- data/lib/answersengine/client/job_export.rb +10 -0
- data/lib/answersengine/client/job_log.rb +27 -0
- data/lib/answersengine/client/job_output.rb +19 -0
- data/lib/answersengine/client/job_page.rb +62 -0
- data/lib/answersengine/client/job_stat.rb +16 -0
- data/lib/answersengine/client/scraper.rb +54 -0
- data/lib/answersengine/client/scraper_deployment.rb +17 -0
- data/lib/answersengine/client/scraper_export.rb +22 -0
- data/lib/answersengine/client/scraper_exporter.rb +14 -0
- data/lib/answersengine/client/scraper_job.rb +49 -0
- data/lib/answersengine/client/scraper_job_output.rb +19 -0
- data/lib/answersengine/client/scraper_job_page.rb +55 -0
- data/lib/answersengine/plugin.rb +6 -0
- data/lib/answersengine/plugin/context_exposer.rb +55 -0
- data/lib/answersengine/scraper.rb +16 -0
- data/lib/answersengine/scraper/executor.rb +292 -0
- data/lib/answersengine/scraper/parser.rb +18 -0
- data/lib/answersengine/scraper/ruby_parser_executor.rb +141 -0
- data/lib/answersengine/scraper/ruby_seeder_executor.rb +114 -0
- data/lib/answersengine/scraper/seeder.rb +18 -0
- data/lib/answersengine/version.rb +3 -0
- metadata +255 -0
module AnswersEngine
  class CLI < Thor
    # CLI subcommands for a scraper's exports: show one, list them, and
    # download an exported file via its signed URL.
    class ScraperExport < Thor
      package_name "scraper export"

      # Render help as "<basename> scraper export <usage>" so the nested
      # subcommand banner reads correctly.
      def self.banner(command, namespace = nil, subcommand = false)
        "#{basename} #{@package_name} #{command.usage}"
      end

      desc "show <export_id>", "Show a scraper's export"
      def show(export_id)
        client = Client::ScraperExport.new(options)
        puts "#{client.find(export_id)}"
      end

      desc "list", "Gets a list of exports"
      long_desc <<-LONGDESC
        List exports.
      LONGDESC
      option :scraper_name, :aliases => :s, type: :string, desc: 'Filter by a specific scraper_name'
      option :page, :aliases => :p, type: :numeric, desc: 'Get the next set of records by page.'
      option :per_page, :aliases => :P, type: :numeric, desc: 'Number of records per page. Max 500 per page.'
      def list()
        # With --scraper-name, list that scraper's exports; otherwise list
        # exports across all scrapers.
        if options[:scraper_name]
          client = Client::ScraperExport.new(options)
          puts "#{client.all(options[:scraper_name])}"
        else
          client = Client::Export.new(options)
          puts "#{client.all}"
        end
      end

      desc "download <export_id>", "Download the exported file"
      def download(export_id)
        client = Client::ScraperExport.new(options)
        result = JSON.parse(client.download(export_id).to_s)

        if result['signed_url']
          puts "Download url: \"#{result['signed_url']}\""
          # Fix: launch the URL with an argument vector instead of a
          # shell-interpolated backtick string, so a crafted signed_url
          # returned by the server cannot inject shell commands.
          # NOTE(review): `open` is macOS-specific — confirm target platform.
          system("open", result['signed_url'])
        else
          puts "Exported file does not exist"
        end
      end
    end
  end
end
module AnswersEngine
  class CLI < Thor
    # CLI subcommands for a scraper's exporters: inspect one, list them on a
    # scraper, or start an export run.
    class ScraperExporter < Thor
      package_name "scraper exporter"

      # Render help as "<basename> scraper exporter <usage>".
      def self.banner(command, namespace = nil, subcommand = false)
        "#{basename} #{@package_name} #{command.usage}"
      end

      desc "show <scraper_name> <exporter_name>", "Show a scraper's exporter"
      def show(scraper_name, exporter_name)
        api = Client::ScraperExporter.new(options)
        puts api.find(scraper_name, exporter_name).to_s
      end

      desc "start <scraper_name> <exporter_name>", "Starts an export"
      option :job, aliases: :j, type: :numeric, desc: 'Set a specific job ID'
      def start(scraper_name, exporter_name)
        # With --job, start the export against that job; otherwise target the
        # scraper's current job.
        # NOTE(review): the else-branch uses Client::ScraperExport (no "er") —
        # looks intentional here, but verify against the client API.
        if options[:job]
          puts Client::JobExport.new(options).create(options[:job], exporter_name).to_s
        else
          puts Client::ScraperExport.new(options).create(scraper_name, exporter_name).to_s
        end
      end

      desc "list <scraper_name>", "gets a list of exporters on a scraper"
      long_desc <<-LONGDESC
        List exporters on a scraper.
      LONGDESC
      option :page, aliases: :p, type: :numeric, desc: 'Get the next set of records by page.'
      option :per_page, aliases: :P, type: :numeric, desc: 'Number of records per page. Max 500 per page.'
      def list(scraper_name)
        api = Client::ScraperExporter.new(options)
        puts api.all(scraper_name).to_s
      end
    end
  end
end
module AnswersEngine
  class CLI < Thor
    # CLI subcommands for managing a scraper's current job:
    # show, list, cancel, resume, pause and update.
    class ScraperJob < Thor
      package_name "scraper job"

      # Render help as "<basename> scraper job <usage>".
      def self.banner(command, namespace = nil, subcommand = false)
        "#{basename} #{@package_name} #{command.usage}"
      end

      desc "show <scraper_name>", "Show a scraper's current job"
      def show(scraper_name)
        puts job_client.find(scraper_name).to_s
      end

      desc "list <scraper_name>", "gets a list of jobs on a scraper"
      long_desc <<-LONGDESC
        List jobs on a scraper.
      LONGDESC
      option :page, aliases: :p, type: :numeric, desc: 'Get the next set of records by page.'
      option :per_page, aliases: :P, type: :numeric, desc: 'Number of records per page. Max 500 per page.'
      def list(scraper_name)
        puts job_client.all(scraper_name).to_s
      end

      desc "cancel <scraper_name>", "cancels a scraper's current job"
      long_desc <<-LONGDESC
        Cancels a scraper's current job
      LONGDESC
      def cancel(scraper_name)
        puts job_client.cancel(scraper_name).to_s
      end

      desc "resume <scraper_name>", "resumes a scraper's current job"
      long_desc <<-LONGDESC
        Resumes a scraper's current job
      LONGDESC
      def resume(scraper_name)
        puts job_client.resume(scraper_name).to_s
      end

      desc "pause <scraper_name>", "pauses a scraper's current job"
      long_desc <<-LONGDESC
        pauses a scraper's current job
      LONGDESC
      def pause(scraper_name)
        puts job_client.pause(scraper_name).to_s
      end

      desc "update <scraper_name>", "updates a scraper's current job"
      long_desc <<-LONGDESC
        Updates a scraper's current job.
      LONGDESC
      option :workers, aliases: :w, type: :numeric, desc: 'Set how many standard workers to use. Scraper job must be restarted(paused then resumed, or cancelled then resumed) for it to take effect. Default: 1. '
      option :browsers, type: :numeric, desc: 'Set how many browser workers to use. Scraper job must be restarted(paused then resumed, or cancelled then resumed) for it to take effect. Default: 0. '
      option :proxy_type, desc: 'Set the Proxy type. Default: standard'
      def update(scraper_name)
        puts job_client.update(scraper_name, options).to_s
      end

      no_commands do
        # Build a fresh job-API client from the current CLI options.
        # Wrapped in no_commands so Thor does not register it as a command.
        def job_client
          Client::ScraperJob.new(options)
        end
      end
    end
  end
end
module AnswersEngine
  class CLI < Thor
    # CLI subcommands for pages within a scraper's job: list, enqueue, update,
    # refetch, reset, show and view per-page logs.
    class ScraperPage < Thor

      package_name "scraper page"

      # Render help as "<basename> scraper page <usage>".
      def self.banner(command, namespace = nil, subcommand = false)
        "#{basename} #{@package_name} #{command.usage}"
      end

      desc "list <scraper_name>", "List Pages on a scraper's current job"
      long_desc <<-LONGDESC
        List all pages in a scraper's current job.\x5
      LONGDESC
      option :job, aliases: :j, type: :numeric, desc: 'Set a specific job ID'
      option :page_type, aliases: :t, type: :string, desc: 'Filter by page_type'
      option :page, aliases: :p, type: :numeric, desc: 'Get the next set of records by page.'
      option :per_page, aliases: :P, type: :numeric, desc: 'Number of records per page. Max 500 per page.'
      option :fetch_fail, type: :boolean, desc: 'Returns only pages that fails fetching.'
      option :parse_fail, type: :boolean, desc: 'Returns only pages that fails parsing.'
      def list(scraper_name)
        # --job targets an explicit job; otherwise the scraper's current job.
        if options[:job]
          puts Client::JobPage.new(options).all(options[:job]).to_s
        else
          puts Client::ScraperJobPage.new(options).all(scraper_name).to_s
        end
      end

      desc "add <scraper_name> <url>", "Enqueues a page to a scraper's current job"
      long_desc <<-LONGDESC
        Enqueues a page to a scraper's current job\x5
      LONGDESC
      option :job, aliases: :j, type: :numeric, desc: 'Set a specific job ID'
      option :method, aliases: :m, type: :string, desc: 'Set request method. Default: GET'
      option :headers, aliases: :H, type: :string, banner: :JSON, desc: 'Set request headers. Must be in json format. i.e: {"Foo":"bar"} '
      option :cookie, aliases: :c, type: :string, desc: 'Set request cookie.'
      option :vars, aliases: :v, type: :string, banner: :JSON, desc: 'Set user-defined page variables. Must be in json format. i.e: {"Foo":"bar"}'
      option :page_type, aliases: :t, desc: 'Set page type'
      option :priority, type: :numeric, desc: 'Set fetch priority. The higher the value, the sooner the page gets fetched. Default: 0'
      option :fetch_type, aliases: :F, desc: 'Set fetch type. Default: http'
      option :body, aliases: :b, desc: 'Set request body'
      option :force_fetch, aliases: :f, type: :boolean, desc: 'Set true to force fetch page that is not within freshness criteria. Default: false'
      option :freshness, aliases: :s, desc: 'Set how fresh the page cache is. Accepts timestap format.'
      option :ua_type, aliases: :u, desc: 'Set user agent type. Default: desktop'
      option :no_redirect, aliases: :n, type: :boolean, desc: 'Set true to not follow redirect. Default: false'
      def add(scraper_name, url)
        # Decode JSON-valued options in place before handing them to the API.
        options[:headers] = JSON.parse(options[:headers]) if options[:headers]
        options[:vars] = JSON.parse(options[:vars]) if options[:vars]
        http_method = options[:method]

        if options[:job]
          puts Client::JobPage.new(options).enqueue(options[:job], http_method, url, options).to_s
        else
          puts Client::ScraperJobPage.new(options).enqueue(scraper_name, http_method, url, options).to_s
        end

      rescue JSON::ParserError
        # Report whichever JSON-valued option(s) were supplied; one of them
        # failed to parse.
        puts "Error: #{options[:headers]} on headers is not a valid JSON" if options[:headers]
        puts "Error: #{options[:vars]} on vars is not a valid JSON" if options[:vars]
      end


      desc "update <scraper_name> <gid>", "Update a page in a scraper's current job"
      long_desc <<-LONGDESC
        Updates a page in a scraper's current job. Only page_type or page vars is updateable.\x5
      LONGDESC
      option :job, aliases: :j, type: :numeric, desc: 'Set a specific job ID'
      option :page_type, aliases: :t, desc: 'Set page type'
      option :priority, type: :numeric, desc: 'Set fetch priority. The higher the value, the sooner the page gets fetched. Default: 0'
      option :vars, aliases: :v, type: :string, desc: 'Set user-defined page variables. Must be in json format. i.e: {"Foo":"bar"}'
      def update(scraper_name, gid)
        options[:vars] = JSON.parse(options[:vars]) if options[:vars]

        if options[:job]
          puts Client::JobPage.new(options).update(options[:job], gid, options).to_s
        else
          puts Client::ScraperJobPage.new(options).update(scraper_name, gid, options).to_s
        end

      rescue JSON::ParserError
        puts "Error: #{options[:vars]} on vars is not a valid JSON" if options[:vars]
      end

      desc "refetch <scraper_name>", "Refetch Pages on a scraper's current job"
      long_desc <<-LONGDESC
        Refetch pages in a scraper's current job. You need to specify either a --gid or --fetch-fail or --parse-fail.\x5
      LONGDESC
      option :gid, aliases: :g, type: :string, desc: 'Refetch a specific GID'
      option :fetch_fail, type: :boolean, desc: 'Refetches only pages that fails fetching.'
      option :parse_fail, type: :boolean, desc: 'Refetches only pages that fails parsing.'
      def refetch(scraper_name)
        # At least one selector option is required; bail out otherwise.
        unless options.key?(:gid) || options.key?(:fetch_fail) || options.key?(:parse_fail)
          puts "Must specify either a --gid or --fetch-fail or --parse-fail"
          return
        end

        puts Client::ScraperJobPage.new(options).refetch(scraper_name).to_s
      end

      desc "reset <scraper_name> <gid>", "Reset fetching and parsing of a page in a scraper's current job"
      long_desc <<-LONGDESC
        Reset fetching and parsing of a page in a scraper's current job.\x5
      LONGDESC
      option :job, aliases: :j, type: :numeric, desc: 'Set a specific job ID'
      def reset(scraper_name, gid)
        # No --vars option is declared here, so this parse is effectively a
        # no-op; kept for parity with the other page commands.
        options[:vars] = JSON.parse(options[:vars]) if options[:vars]

        if options[:job]
          puts Client::JobPage.new(options).reset(options[:job], gid, options).to_s
        else
          puts Client::ScraperJobPage.new(options).reset(scraper_name, gid, options).to_s
        end

      rescue JSON::ParserError
        puts "Error: #{options[:vars]} on vars is not a valid JSON" if options[:vars]
      end

      desc "show <scraper_name> <gid>", "Show a page in scraper's current job"
      long_desc <<-LONGDESC
        Shows a page in a scraper's current job.\x5
      LONGDESC
      option :job, aliases: :j, type: :numeric, desc: 'Set a specific job ID'
      def show(scraper_name, gid)
        if options[:job]
          puts Client::JobPage.new(options).find(options[:job], gid).to_s
        else
          puts Client::ScraperJobPage.new(options).find(scraper_name, gid).to_s
        end
      end

      desc "log <scraper_name> <gid>", "List log entries related to a job page"
      long_desc <<-LONGDESC
        Shows log related to a page in the job. Defaults to showing the most recent entries\x5
      LONGDESC
      option :job, aliases: :j, type: :numeric, desc: 'Set a specific job ID'
      option :head, aliases: :H, desc: 'Show the oldest log entries. If not set, newest entries is shown'
      option :parsing, aliases: :p, type: :boolean, desc: 'Show only log entries related to parsing'
      option :more, aliases: :m, desc: 'Show next set of log entries. Enter the `More token`'
      option :per_page, aliases: :P, type: :numeric, desc: 'Number of records per page. Max 5000 per page.'
      def log(scraper_name, gid)
        log_api = Client::JobLog.new(options)

        # Translate CLI options into the log API's query parameters. Paging
        # options are delete()d so they are not also sent as client options.
        query = {}
        query["order"] = options.delete(:head) if options[:head]
        query["job_type"] = "parsing" if options[:parsing]

        query["page_token"] = options.delete(:more) if options[:more]
        query["per_page"] = options.delete(:per_page) if options[:per_page]

        puts "Fetching page logs..."

        result =
          if options[:job]
            log_api.all_job_page_log(options[:job], gid, {query: query})
          else
            log_api.scraper_all_job_page_log(scraper_name, gid, {query: query})
          end

        if result['entries'].nil? || result["entries"].length == 0
          puts "No logs yet, please try again later."
          return
        end

        more_token = result["more_token"]

        result["entries"].each do |entry|
          puts "#{entry["timestamp"]} #{entry["severity"]}: #{entry["payload"]}" if entry.is_a?(Hash)
        end

        puts "to see more entries, add: \"--more #{more_token}\"" unless more_token.nil?
      end

    end
  end
end
module AnswersEngine
  class CLI < Thor
    # CLI subcommands for dry-running and executing seeder scripts against a
    # scraper job.
    class Seeder < Thor
      # Fix: Thor registers commands under the METHOD name, so the usage
      # string must say "try_seed" — the previous "try <scraper_name> ..."
      # advertised a command that does not exist.
      desc "try_seed <scraper_name> <seeder_file>", "Tries a seeder file"
      long_desc <<-LONGDESC
        Takes a seeder script and tries to execute it without saving anything.\x5
        <seeder_file>: Seeder script file will be executed.\x5
      LONGDESC
      option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
      def try_seed(scraper_name, seeder_file)
        # Resolve the target job: an explicit --job wins, otherwise use the
        # scraper's current job.
        if options[:job]
          job_id = options[:job]
        else
          job = Client::ScraperJob.new(options).find(scraper_name)
          job_id = job['id']
        end

        # false => dry run: execute the seeder without saving any pages.
        puts AnswersEngine::Scraper::Seeder.exec_seeder(seeder_file, job_id, false)
      end

      # Fix: same usage/command-name mismatch as above ("exec" vs the actual
      # command name "exec_parse").
      desc "exec_parse <scraper_name> <seeder_file>", "Executes a seeder script onto a scraper's current job."
      long_desc <<-LONGDESC
        Takes a seeder script and execute it against a job and enqueues the pages into the scraper's current job\x5
        <seeder_file>: Seeder script file that will be executed on the scraper's current job.\x5
      LONGDESC
      option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
      def exec_parse(scraper_name, seeder_file)
        if options[:job]
          job_id = options[:job]
        else
          job = Client::ScraperJob.new(options).find(scraper_name)
          job_id = job['id']
        end

        # true => actually enqueue the seeded pages into the job.
        puts AnswersEngine::Scraper::Seeder.exec_seeder(seeder_file, job_id, true)
      end
    end
  end
end
# Load every API client class (base first, since the others build on it),
# then declare the namespace they all live in. Quote style unified.
require "answersengine/client/base"
require "answersengine/client/export"
require "answersengine/client/scraper"
require "answersengine/client/scraper_deployment"
require "answersengine/client/scraper_job_output"
require "answersengine/client/scraper_job_page"
require "answersengine/client/scraper_exporter"
require "answersengine/client/scraper_export"
require "answersengine/client/scraper_job"
require "answersengine/client/job_export"
require "answersengine/client/job"
require "answersengine/client/job_log"
require "answersengine/client/global_page"
require "answersengine/client/job_page"
require "answersengine/client/job_output"
require "answersengine/client/job_stat"
require "answersengine/client/backblaze_content"


module AnswersEngine
  module Client
  end
end
require 'zlib'
require 'httparty'

module AnswersEngine
  module Client
    # Thin HTTP helper for downloading (optionally gzip-compressed) content,
    # e.g. exported files hosted on Backblaze.
    class BackblazeContent
      include HTTParty

      # Fetch the raw body at +url+ as plain text.
      def get_content(url)
        self.class.get(url, format: :plain)
      end

      # Fetch +url+ and gunzip the response body.
      def get_gunzipped_content(url)
        gunzip(get_content(url))
      end

      # Decompress a gzip +string+, tolerating truncated streams: when the
      # reader fails with "unexpected end of file", whatever could be read so
      # far is salvaged character by character instead of raising.
      def gunzip(string)
        reader = Zlib::GzipReader.new(StringIO.new(string), encoding: Encoding::ASCII_8BIT)
        data = ""
        begin
          data = reader.read
        rescue => e
          # Anything other than a truncated stream is a real error.
          # NOTE(review): matching on the message text is fragile — confirm
          # which Zlib exception class carries it before tightening this.
          raise e unless e.to_s == 'unexpected end of file'
          begin
            data += reader.readchar until reader.eof?
          rescue => inner
            puts "Ignored Zlib error: #{inner.to_s}"
          end
        end

        return data
      ensure
        # Close the reader even on failure; respond_to? guards the case where
        # construction itself raised and +reader+ is nil.
        reader.close if reader.respond_to?(:close)
      end
    end
  end
end