answersengine 0.2.33
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +12 -0
- data/.travis.yml +7 -0
- data/CODE_OF_CONDUCT.md +74 -0
- data/Gemfile +6 -0
- data/LICENSE.txt +21 -0
- data/README.md +30 -0
- data/Rakefile +22 -0
- data/answersengine.gemspec +45 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/examples/fetchtest/libraries/hello.rb +9 -0
- data/examples/fetchtest/libraries/hello_fail.rb +10 -0
- data/examples/fetchtest/parsers/failed.rb +2 -0
- data/examples/fetchtest/parsers/find_outputs.rb +18 -0
- data/examples/fetchtest/parsers/home.rb +50 -0
- data/examples/fetchtest/parsers/nested_fail.rb +3 -0
- data/examples/fetchtest/parsers/simple.rb +14 -0
- data/examples/fetchtest/seeders/csv_seeder.rb +12 -0
- data/examples/fetchtest/seeders/failed.rb +1 -0
- data/examples/fetchtest/seeders/list_of_urls.csv +5 -0
- data/examples/fetchtest/seeders/seed.rb +28 -0
- data/examples/fetchtest/seeders/test_reset_page.rb +4 -0
- data/exe/answersengine +3 -0
- data/lib/answersengine.rb +5 -0
- data/lib/answersengine/cli.rb +33 -0
- data/lib/answersengine/cli/global_page.rb +39 -0
- data/lib/answersengine/cli/job.rb +30 -0
- data/lib/answersengine/cli/job_output.rb +69 -0
- data/lib/answersengine/cli/parser.rb +64 -0
- data/lib/answersengine/cli/scraper.rb +172 -0
- data/lib/answersengine/cli/scraper_deployment.rb +24 -0
- data/lib/answersengine/cli/scraper_export.rb +51 -0
- data/lib/answersengine/cli/scraper_exporter.rb +40 -0
- data/lib/answersengine/cli/scraper_job.rb +71 -0
- data/lib/answersengine/cli/scraper_page.rb +200 -0
- data/lib/answersengine/cli/seeder.rb +40 -0
- data/lib/answersengine/client.rb +23 -0
- data/lib/answersengine/client/backblaze_content.rb +45 -0
- data/lib/answersengine/client/base.rb +50 -0
- data/lib/answersengine/client/export.rb +10 -0
- data/lib/answersengine/client/global_page.rb +18 -0
- data/lib/answersengine/client/job.rb +53 -0
- data/lib/answersengine/client/job_export.rb +10 -0
- data/lib/answersengine/client/job_log.rb +27 -0
- data/lib/answersengine/client/job_output.rb +19 -0
- data/lib/answersengine/client/job_page.rb +62 -0
- data/lib/answersengine/client/job_stat.rb +16 -0
- data/lib/answersengine/client/scraper.rb +54 -0
- data/lib/answersengine/client/scraper_deployment.rb +17 -0
- data/lib/answersengine/client/scraper_export.rb +22 -0
- data/lib/answersengine/client/scraper_exporter.rb +14 -0
- data/lib/answersengine/client/scraper_job.rb +49 -0
- data/lib/answersengine/client/scraper_job_output.rb +19 -0
- data/lib/answersengine/client/scraper_job_page.rb +55 -0
- data/lib/answersengine/plugin.rb +6 -0
- data/lib/answersengine/plugin/context_exposer.rb +55 -0
- data/lib/answersengine/scraper.rb +16 -0
- data/lib/answersengine/scraper/executor.rb +292 -0
- data/lib/answersengine/scraper/parser.rb +18 -0
- data/lib/answersengine/scraper/ruby_parser_executor.rb +141 -0
- data/lib/answersengine/scraper/ruby_seeder_executor.rb +114 -0
- data/lib/answersengine/scraper/seeder.rb +18 -0
- data/lib/answersengine/version.rb +3 -0
- metadata +255 -0
require 'httparty'
require 'json' # JSON.parse is used below; make the stdlib dependency explicit

module AnswersEngine
  module Client
    # Base HTTP client for the AnswersEngine Fetch API.
    #
    # Configures HTTParty's base URI, bearer-token auth headers and the
    # common query-string options shared by every resource client that
    # subclasses it.
    class Base
      include HTTParty
      # Allow the endpoint to be overridden via the environment; fall back
      # to the production API. (ENV.fetch default applies only when unset,
      # which matches the previous explicit nil check.)
      base_uri(ENV.fetch('ANSWERSENGINE_API_URL', 'https://fetch.answersengine.com/api/v1'))

      # Default auth token, read from the environment.
      def self.env_auth_token
        ENV['ANSWERSENGINE_TOKEN']
      end

      # Memoized per-instance token; defaults to the environment token.
      def auth_token
        @auth_token ||= self.class.env_auth_token
      end

      def auth_token=(value)
        @auth_token = value
      end

      # @param opts [Hash] supported keys: :auth_token, :page, :per_page,
      #   :fetch_fail, :parse_fail, :page_type, :gid, and :query (a Hash,
      #   or a String containing JSON).
      def initialize(opts = {})
        self.auth_token = opts[:auth_token] unless opts[:auth_token].nil?
        @options = { headers: {
          "Authorization" => "Bearer #{auth_token}",
          "Content-Type" => "application/json",
        }}

        query = {}
        query[:p] = opts[:page] if opts[:page]
        query[:pp] = opts[:per_page] if opts[:per_page]
        query[:fetchfail] = opts[:fetch_fail] if opts[:fetch_fail]
        query[:parsefail] = opts[:parse_fail] if opts[:parse_fail]
        query[:page_type] = opts[:page_type] if opts[:page_type]
        query[:gid] = opts[:gid] if opts[:gid]

        if opts[:query]
          if opts[:query].is_a?(Hash)
            query[:q] = opts[:query].to_json
          elsif opts[:query].is_a?(String)
            # Round-trip through JSON.parse so an invalid JSON string
            # raises here rather than producing a broken request.
            query[:q] = JSON.parse(opts[:query]).to_json
          end
        end

        @options.merge!(query: query) unless query.empty?
      end
    end
  end
end
module AnswersEngine
  module Client
    # Read-only access to global pages and their stored content.
    class GlobalPage < AnswersEngine::Client::Base
      # Fetch metadata for the global page identified by gid.
      def find(gid)
        fetch_path("/global_pages/#{gid}")
      end

      # Fetch the successfully stored content of the page.
      def find_content(gid)
        fetch_path("/global_pages/#{gid}/content")
      end

      # Fetch the content captured when the page fetch failed.
      def find_failed_content(gid)
        fetch_path("/global_pages/#{gid}/failed_content")
      end

      private

      # Every endpoint here is a plain GET using the shared client options.
      def fetch_path(path)
        self.class.get(path, @options)
      end
    end
  end
end
module AnswersEngine
  module Client
    # Lifecycle operations on scrape jobs (list, find, update status,
    # cancel/resume/pause, seeding callbacks).
    class Job < AnswersEngine::Client::Base
      # List all jobs. opts is accepted for interface symmetry but is
      # currently unused — paging etc. come from the constructor options.
      def all(opts={})
        self.class.get("/jobs", @options)
      end

      def find(job_id)
        self.class.get("/jobs/#{job_id}", @options)
      end

      # Update a job's status and/or worker counts.
      # @param opts [Hash] :status, :workers, :browsers
      def update(job_id, opts={})
        body = {}
        body[:status] = opts[:status] if opts[:status]
        body[:standard_worker_count] = opts[:workers] if opts[:workers]
        body[:browser_worker_count] = opts[:browsers] if opts[:browsers]
        @options.merge!({body: body.to_json})

        self.class.put("/jobs/#{job_id}", @options)
      end

      # NOTE: these previously mutated the caller's opts hash in place
      # (opts[:status] = ...); merge onto a copy instead so the caller's
      # hash is left untouched. The request sent is identical.
      def cancel(job_id, opts={})
        update(job_id, opts.merge(status: 'cancelled'))
      end

      def resume(job_id, opts={})
        update(job_id, opts.merge(status: 'active'))
      end

      def pause(job_id, opts={})
        update(job_id, opts.merge(status: 'paused'))
      end

      # Report seeding results back to the server. outputs/pages default
      # to empty arrays; seeding_status defaults to nil (explicitly sent).
      def seeding_update(job_id, opts={})
        body = {}
        body[:outputs] = opts.fetch(:outputs) {[]}
        body[:pages] = opts.fetch(:pages) {[]}
        body[:seeding_status] = opts.fetch(:seeding_status){ nil }
        body[:log_error] = opts[:log_error] if opts[:log_error]

        @options.merge!({body: body.to_json})

        self.class.put("/jobs/#{job_id}/seeding_update", @options)
      end
    end
  end
end
module AnswersEngine
  module Client
    # Retrieval of job-level and per-page log entries, addressed either
    # by job id or by scraper name (current job).
    class JobLog < AnswersEngine::Client::Base
      def all_job_page_log(job_id, gid, opts={})
        get_log("/jobs/#{job_id}/pages/#{gid}/log", opts)
      end

      def scraper_all_job_page_log(scraper_name, gid, opts={})
        get_log("/scrapers/#{scraper_name}/current_job/pages/#{gid}/log", opts)
      end

      def all_job_log(job_id, opts={})
        get_log("/jobs/#{job_id}/log", opts)
      end

      def scraper_all_job_log(scraper_name, opts={})
        get_log("/scrapers/#{scraper_name}/current_job/log", opts)
      end

      private

      # Every log endpoint merges caller-supplied request options into the
      # shared options before issuing the GET (same as the original code).
      def get_log(path, opts)
        @options.merge!(opts)
        self.class.get(path, @options)
      end
    end
  end
end
module AnswersEngine
  module Client
    # Access to the output records a job has written, grouped by collection.
    class JobOutput < AnswersEngine::Client::Base
      # Fetch a single output record by id.
      def find(job_id, collection, id)
        record_path = "/jobs/#{job_id}/output/collections/#{collection}/records/#{id}"
        self.class.get(record_path, @options)
      end

      # List records in a collection (the 'default' collection if omitted).
      def all(job_id, collection = 'default')
        records_path = "/jobs/#{job_id}/output/collections/#{collection}/records"
        self.class.get(records_path, @options)
      end

      # List the collections this job has produced output into.
      def collections(job_id)
        self.class.get("/jobs/#{job_id}/output/collections", @options)
      end
    end
  end
end
module AnswersEngine
  module Client
    # Operations on the pages belonging to a job: finding, listing,
    # updating, resetting, enqueueing new fetches and reporting parse
    # results back to the server.
    class JobPage < AnswersEngine::Client::Base
      # Optional request attributes copied verbatim from opts into the
      # enqueue body when present (truthiness check, matching the
      # original per-key `if opts[:k]` guards).
      ENQUEUE_OPT_KEYS = %i[
        page_type priority fetch_type body headers vars
        force_fetch freshness ua_type no_redirect cookie
      ].freeze

      def find(job_id, gid)
        self.class.get("/jobs/#{job_id}/pages/#{gid}", @options)
      end

      # opts is accepted for interface symmetry but currently unused.
      def all(job_id, opts={})
        self.class.get("/jobs/#{job_id}/pages", @options)
      end

      # Update page attributes. @param opts [Hash] :page_type, :priority, :vars
      def update(job_id, gid, opts={})
        body = {}
        body[:page_type] = opts[:page_type] if opts[:page_type]
        body[:priority] = opts[:priority] if opts[:priority]
        body[:vars] = opts[:vars] if opts[:vars]

        @options.merge!({body: body.to_json})

        self.class.put("/jobs/#{job_id}/pages/#{gid}", @options)
      end

      def reset(job_id, gid, opts={})
        self.class.put("/jobs/#{job_id}/pages/#{gid}/reset", @options)
      end

      # Enqueue a new page fetch. The HTTP method defaults to "GET" when
      # blank; previously a nil method slipped through the != "" check and
      # was sent as `"method": null`.
      def enqueue(job_id, method, url, opts={})
        body = {}
        body[:method] = (method.nil? || method == "") ? "GET" : method
        body[:url] = url
        ENQUEUE_OPT_KEYS.each do |key|
          body[key] = opts[key] if opts[key]
        end

        @options.merge!({body: body.to_json})

        self.class.post("/jobs/#{job_id}/pages", @options)
      end

      # Report parsing results. outputs/pages default to empty arrays;
      # parsing_status defaults to nil (explicitly sent).
      def parsing_update(job_id, gid, opts={})
        body = {}
        body[:outputs] = opts.fetch(:outputs) {[]}
        body[:pages] = opts.fetch(:pages) {[]}
        body[:parsing_status] = opts.fetch(:parsing_status){ nil }
        body[:log_error] = opts[:log_error] if opts[:log_error]

        @options.merge!({body: body.to_json})

        self.class.put("/jobs/#{job_id}/pages/#{gid}/parsing_update", @options)
      end
    end
  end
end
module AnswersEngine
  module Client
    # Current statistics for a job, addressed by job id or scraper name.
    class JobStat < AnswersEngine::Client::Base
      # Stats for an explicit job id.
      def job_current_stats(job_id)
        stats_path = "/jobs/#{job_id}/stats/current"
        self.class.get(stats_path, @options)
      end

      # Stats for the scraper's current job.
      def scraper_job_current_stats(scraper_name)
        stats_path = "/scrapers/#{scraper_name}/current_job/stats/current"
        self.class.get(stats_path, @options)
      end
    end
  end
end
module AnswersEngine
  module Client
    # CRUD operations on scraper definitions.
    class Scraper < AnswersEngine::Client::Base

      def find(scraper_name)
        self.class.get("/scrapers/#{scraper_name}", @options)
      end

      # opts is accepted for interface symmetry but currently unused.
      def all(opts={})
        self.class.get("/scrapers", @options)
      end

      # Register a new scraper backed by a git repository.
      # @param opts [Hash] :branch (default "master"), :freshness_type,
      #   :force_fetch, :workers, :browsers, :proxy_type,
      #   :disable_scheduler, :cancel_current_job, :schedule, :timezone
      def create(scraper_name, git_repository, opts={})
        body = {
          name: scraper_name,
          git_repository: git_repository,
          git_branch: opts[:branch] ? opts[:branch] : "master"}

        body[:freshness_type] = opts[:freshness_type] if opts[:freshness_type]
        body[:force_fetch] = opts[:force_fetch] if opts[:force_fetch]
        body[:standard_worker_count] = opts[:workers] if opts[:workers]
        body[:browser_worker_count] = opts[:browsers] if opts[:browsers]
        body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
        body[:disable_scheduler] = opts[:disable_scheduler] if opts[:disable_scheduler]
        body[:cancel_current_job] = opts[:cancel_current_job] if opts[:cancel_current_job]
        body[:schedule] = opts[:schedule] if opts[:schedule]
        body[:timezone] = opts[:timezone] if opts[:timezone]
        @options.merge!({body: body.to_json})
        self.class.post("/scrapers", @options)
      end

      # Update a scraper. force_fetch / disable_scheduler /
      # cancel_current_job are tri-state flags that may legitimately be
      # false, so presence is tested rather than truthiness.
      #
      # BUG FIX: presence was previously tested with STRING keys
      # (has_key?("force_fetch")) while the value was read with a SYMBOL
      # key — symbol-keyed callers were silently ignored and string-keyed
      # callers got nil values. copy_flag now accepts either key form.
      def update(scraper_name, opts={})
        body = {}

        body[:name] = opts[:name] if opts[:name]
        body[:git_repository] = opts[:repo] if opts[:repo]
        body[:git_branch] = opts[:branch] if opts[:branch]
        body[:freshness_type] = opts[:freshness_type] if opts[:freshness_type]
        copy_flag(body, opts, :force_fetch)
        body[:standard_worker_count] = opts[:workers] if opts[:workers]
        body[:browser_worker_count] = opts[:browsers] if opts[:browsers]
        body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
        copy_flag(body, opts, :disable_scheduler)
        copy_flag(body, opts, :cancel_current_job)
        body[:schedule] = opts[:schedule] if opts[:schedule]
        body[:timezone] = opts[:timezone] if opts[:timezone]
        @options.merge!({body: body.to_json})

        self.class.put("/scrapers/#{scraper_name}", @options)
      end

      private

      # Copy a presence-sensitive flag into body under its symbol name,
      # whether the caller supplied it as a symbol or a string key.
      def copy_flag(body, opts, key)
        if opts.key?(key)
          body[key] = opts[key]
        elsif opts.key?(key.to_s)
          body[key] = opts[key.to_s]
        end
      end
    end
  end
end
module AnswersEngine
  module Client
    # Listing and triggering of scraper code deployments.
    class ScraperDeployment < AnswersEngine::Client::Base

      # List past deployments. opts is accepted for interface symmetry
      # but is currently unused.
      def all(scraper_name, opts={})
        deployments_path = "/scrapers/#{scraper_name}/deployments"
        self.class.get(deployments_path, @options)
      end

      # Trigger a new deployment from the scraper's git repository.
      def deploy(scraper_name, opts={})
        deployments_path = "/scrapers/#{scraper_name}/deployments"
        self.class.post(deployments_path, @options)
      end

    end
  end
end
module AnswersEngine
  module Client
    # Creation, lookup and download of scraper exports.
    class ScraperExport < AnswersEngine::Client::Base
      # List exports for a scraper. opts is accepted for interface
      # symmetry but is currently unused.
      def all(scraper_name, opts={})
        self.class.get("/scrapers/#{scraper_name}/exports", @options)
      end

      # Look up a single export by id.
      def find(export_id)
        export_path = "/scrapers/exports/#{export_id}"
        self.class.get(export_path, @options)
      end

      # Start a new export using the named exporter.
      def create(scraper_name, exporter_name)
        create_path = "/scrapers/#{scraper_name}/exports/#{exporter_name}"
        self.class.post(create_path, @options)
      end

      # Download a finished export.
      def download(export_id)
        download_path = "/scrapers/exports/#{export_id}/download"
        self.class.get(download_path, @options)
      end
    end
  end
end
module AnswersEngine
  module Client
    # Read-only access to the exporters configured on a scraper.
    class ScraperExporter < AnswersEngine::Client::Base
      # List exporters. opts is accepted for interface symmetry but is
      # currently unused.
      def all(scraper_name, opts={})
        exporters_path = "/scrapers/#{scraper_name}/exporters"
        self.class.get(exporters_path, @options)
      end

      # Look up a single exporter by name.
      def find(scraper_name, exporter_name)
        exporter_path = "/scrapers/#{scraper_name}/exporters/#{exporter_name}"
        self.class.get(exporter_path, @options)
      end
    end
  end
end
module AnswersEngine
  module Client
    # Operations on a scraper's jobs and, in particular, its current job.
    class ScraperJob < AnswersEngine::Client::Base
      # List a scraper's jobs. opts is accepted for interface symmetry
      # but is currently unused.
      def all(scraper_name, opts={})
        self.class.get("/scrapers/#{scraper_name}/jobs", @options)
      end

      # Start a new job. @param opts [Hash] :workers, :browsers, :proxy_type
      def create(scraper_name, opts={})
        body = {}
        body[:standard_worker_count] = opts[:workers] if opts[:workers]
        body[:browser_worker_count] = opts[:browsers] if opts[:browsers]
        body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
        @options.merge!({body: body.to_json})
        self.class.post("/scrapers/#{scraper_name}/jobs", @options)
      end

      # Fetch the scraper's current job.
      def find(scraper_name)
        self.class.get("/scrapers/#{scraper_name}/current_job", @options)
      end

      # Update the current job's status / worker counts / proxy type.
      def update(scraper_name, opts={})
        body = {}
        body[:status] = opts[:status] if opts[:status]
        body[:standard_worker_count] = opts[:workers] if opts[:workers]
        body[:browser_worker_count] = opts[:browsers] if opts[:browsers]
        body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
        @options.merge!({body: body.to_json})

        self.class.put("/scrapers/#{scraper_name}/current_job", @options)
      end

      # NOTE: these previously mutated the caller's opts hash in place
      # (opts[:status] = ...); merge onto a copy instead so the caller's
      # hash is left untouched. The request sent is identical.
      def cancel(scraper_name, opts={})
        update(scraper_name, opts.merge(status: 'cancelled'))
      end

      def resume(scraper_name, opts={})
        update(scraper_name, opts.merge(status: 'active'))
      end

      def pause(scraper_name, opts={})
        update(scraper_name, opts.merge(status: 'paused'))
      end
    end
  end
end