answersengine 0.2.33

Sign up to get free protection for your applications and to get access to all the features.
Files changed (65)
  1. checksums.yaml +7 -0
  2. data/.gitignore +12 -0
  3. data/.travis.yml +7 -0
  4. data/CODE_OF_CONDUCT.md +74 -0
  5. data/Gemfile +6 -0
  6. data/LICENSE.txt +21 -0
  7. data/README.md +30 -0
  8. data/Rakefile +22 -0
  9. data/answersengine.gemspec +45 -0
  10. data/bin/console +14 -0
  11. data/bin/setup +8 -0
  12. data/examples/fetchtest/libraries/hello.rb +9 -0
  13. data/examples/fetchtest/libraries/hello_fail.rb +10 -0
  14. data/examples/fetchtest/parsers/failed.rb +2 -0
  15. data/examples/fetchtest/parsers/find_outputs.rb +18 -0
  16. data/examples/fetchtest/parsers/home.rb +50 -0
  17. data/examples/fetchtest/parsers/nested_fail.rb +3 -0
  18. data/examples/fetchtest/parsers/simple.rb +14 -0
  19. data/examples/fetchtest/seeders/csv_seeder.rb +12 -0
  20. data/examples/fetchtest/seeders/failed.rb +1 -0
  21. data/examples/fetchtest/seeders/list_of_urls.csv +5 -0
  22. data/examples/fetchtest/seeders/seed.rb +28 -0
  23. data/examples/fetchtest/seeders/test_reset_page.rb +4 -0
  24. data/exe/answersengine +3 -0
  25. data/lib/answersengine.rb +5 -0
  26. data/lib/answersengine/cli.rb +33 -0
  27. data/lib/answersengine/cli/global_page.rb +39 -0
  28. data/lib/answersengine/cli/job.rb +30 -0
  29. data/lib/answersengine/cli/job_output.rb +69 -0
  30. data/lib/answersengine/cli/parser.rb +64 -0
  31. data/lib/answersengine/cli/scraper.rb +172 -0
  32. data/lib/answersengine/cli/scraper_deployment.rb +24 -0
  33. data/lib/answersengine/cli/scraper_export.rb +51 -0
  34. data/lib/answersengine/cli/scraper_exporter.rb +40 -0
  35. data/lib/answersengine/cli/scraper_job.rb +71 -0
  36. data/lib/answersengine/cli/scraper_page.rb +200 -0
  37. data/lib/answersengine/cli/seeder.rb +40 -0
  38. data/lib/answersengine/client.rb +23 -0
  39. data/lib/answersengine/client/backblaze_content.rb +45 -0
  40. data/lib/answersengine/client/base.rb +50 -0
  41. data/lib/answersengine/client/export.rb +10 -0
  42. data/lib/answersengine/client/global_page.rb +18 -0
  43. data/lib/answersengine/client/job.rb +53 -0
  44. data/lib/answersengine/client/job_export.rb +10 -0
  45. data/lib/answersengine/client/job_log.rb +27 -0
  46. data/lib/answersengine/client/job_output.rb +19 -0
  47. data/lib/answersengine/client/job_page.rb +62 -0
  48. data/lib/answersengine/client/job_stat.rb +16 -0
  49. data/lib/answersengine/client/scraper.rb +54 -0
  50. data/lib/answersengine/client/scraper_deployment.rb +17 -0
  51. data/lib/answersengine/client/scraper_export.rb +22 -0
  52. data/lib/answersengine/client/scraper_exporter.rb +14 -0
  53. data/lib/answersengine/client/scraper_job.rb +49 -0
  54. data/lib/answersengine/client/scraper_job_output.rb +19 -0
  55. data/lib/answersengine/client/scraper_job_page.rb +55 -0
  56. data/lib/answersengine/plugin.rb +6 -0
  57. data/lib/answersengine/plugin/context_exposer.rb +55 -0
  58. data/lib/answersengine/scraper.rb +16 -0
  59. data/lib/answersengine/scraper/executor.rb +292 -0
  60. data/lib/answersengine/scraper/parser.rb +18 -0
  61. data/lib/answersengine/scraper/ruby_parser_executor.rb +141 -0
  62. data/lib/answersengine/scraper/ruby_seeder_executor.rb +114 -0
  63. data/lib/answersengine/scraper/seeder.rb +18 -0
  64. data/lib/answersengine/version.rb +3 -0
  65. metadata +255 -0
@@ -0,0 +1,50 @@
1
require 'httparty'

module AnswersEngine
  module Client
    # Shared HTTP plumbing for every AnswersEngine API client.
    #
    # Builds the default request options (+@options+) that subclasses pass to
    # the HTTParty verbs: a Bearer-token Authorization header, a JSON content
    # type, and an optional query hash assembled from caller-supplied +opts+.
    class Base
      include HTTParty
      # API root; override by setting ANSWERSENGINE_API_URL in the environment.
      base_uri(ENV.fetch('ANSWERSENGINE_API_URL', 'https://fetch.answersengine.com/api/v1'))

      # Token read from the environment when none is supplied explicitly.
      def self.env_auth_token
        ENV['ANSWERSENGINE_TOKEN']
      end

      # Lazily falls back to the environment token.
      def auth_token
        @auth_token ||= self.class.env_auth_token
      end

      def auth_token=(value)
        @auth_token = value
      end

      # @param opts [Hash] recognized keys: :auth_token, :page, :per_page,
      #   :fetch_fail, :parse_fail, :page_type, :gid and :query (Hash or
      #   JSON String).
      def initialize(opts = {})
        self.auth_token = opts[:auth_token] unless opts[:auth_token].nil?
        @options = { headers: {
          "Authorization" => "Bearer #{auth_token}",
          "Content-Type" => "application/json",
        }}

        query = build_query(opts)
        @options.merge!(query: query) unless query.empty?
      end

      private

      # Translates caller option names into the API's query-string parameters.
      # Falsy option values are deliberately skipped (presence implies intent).
      def build_query(opts)
        query = {}
        query[:p] = opts[:page] if opts[:page]
        query[:pp] = opts[:per_page] if opts[:per_page]
        query[:fetchfail] = opts[:fetch_fail] if opts[:fetch_fail]
        query[:parsefail] = opts[:parse_fail] if opts[:parse_fail]
        query[:page_type] = opts[:page_type] if opts[:page_type]
        query[:gid] = opts[:gid] if opts[:gid]

        if opts[:query]
          if opts[:query].is_a?(Hash)
            query[:q] = opts[:query].to_json
          elsif opts[:query].is_a?(String)
            # Round-trip through JSON.parse so invalid JSON fails fast here
            # (JSON::ParserError) instead of producing a broken request.
            query[:q] = JSON.parse(opts[:query]).to_json
          end
        end

        query
      end
    end
  end
end
@@ -0,0 +1,10 @@
1
module AnswersEngine
  module Client
    # Account-wide export listing.
    class Export < AnswersEngine::Client::Base
      # Lists all exports across every scraper. +opts+ is unused but kept
      # for interface symmetry with the sibling clients.
      def all(opts = {})
        self.class.get("/scrapers/exports", @options)
      end
    end
  end
end
@@ -0,0 +1,18 @@
1
module AnswersEngine
  module Client
    # Read access to globally cached pages, addressed by GID.
    class GlobalPage < AnswersEngine::Client::Base
      # Page metadata.
      def find(gid)
        fetch_page(gid)
      end

      # Content stored for a successfully fetched page.
      def find_content(gid)
        fetch_page(gid, 'content')
      end

      # Content captured from the most recent failed fetch.
      def find_failed_content(gid)
        fetch_page(gid, 'failed_content')
      end

      private

      # GETs /global_pages/<gid>[/<suffix>] with the default options.
      def fetch_page(gid, suffix = nil)
        path = "/global_pages/#{gid}"
        path += "/#{suffix}" if suffix
        self.class.get(path, @options)
      end
    end
  end
end
@@ -0,0 +1,53 @@
1
module AnswersEngine
  module Client
    # Client for the /jobs endpoints: listing, lookup, status changes, and
    # the bulk seeding-update callback used by the seeder executor.
    class Job < AnswersEngine::Client::Base
      def all(opts={})
        self.class.get("/jobs", @options)
      end

      def find(job_id)
        self.class.get("/jobs/#{job_id}", @options)
      end

      # Pushes status/worker-count changes for a job. Only truthy options
      # are included in the request body.
      def update(job_id, opts={})
        payload = {}
        payload[:status] = opts[:status] if opts[:status]
        payload[:standard_worker_count] = opts[:workers] if opts[:workers]
        payload[:browser_worker_count] = opts[:browsers] if opts[:browsers]
        @options.merge!(body: payload.to_json)

        self.class.put("/jobs/#{job_id}", @options)
      end

      # NOTE: the status helpers below deliberately write :status into the
      # caller's opts hash (merge!) before delegating to #update, matching
      # the original in-place mutation.
      def cancel(job_id, opts={})
        update(job_id, opts.merge!(status: 'cancelled'))
      end

      def resume(job_id, opts={})
        update(job_id, opts.merge!(status: 'active'))
      end

      def pause(job_id, opts={})
        update(job_id, opts.merge!(status: 'paused'))
      end

      # Reports seeding results back to the API: new outputs, new pages, the
      # final seeding status, plus an optional error log line.
      def seeding_update(job_id, opts={})
        payload = {
          outputs: opts.fetch(:outputs) { [] },
          pages: opts.fetch(:pages) { [] },
          seeding_status: opts.fetch(:seeding_status) { nil }
        }
        payload[:log_error] = opts[:log_error] if opts[:log_error]

        @options.merge!(body: payload.to_json)

        self.class.put("/jobs/#{job_id}/seeding_update", @options)
      end
    end
  end
end
@@ -0,0 +1,10 @@
1
module AnswersEngine
  module Client
    # Starts exports for a specific job.
    class JobExport < AnswersEngine::Client::Base
      # Kicks off the named exporter against the job's output.
      def create(job_id, exporter_name)
        path = "/jobs/#{job_id}/exports/#{exporter_name}"
        self.class.post(path, @options)
      end
    end
  end
end
@@ -0,0 +1,27 @@
1
module AnswersEngine
  module Client
    # Read access to job-level and page-level logs.
    #
    # BUGFIX: these methods previously called @options.merge!(opts), folding
    # per-call options permanently into the instance state so they leaked into
    # every later request made through the same client object. They now merge
    # non-destructively; the request sent for a single call is unchanged.
    class JobLog < AnswersEngine::Client::Base
      # Log lines for one page (by GID) of a job.
      def all_job_page_log(job_id, gid, opts={})
        self.class.get("/jobs/#{job_id}/pages/#{gid}/log", @options.merge(opts))
      end

      # Log lines for one page of a scraper's current job.
      def scraper_all_job_page_log(scraper_name, gid, opts={})
        self.class.get("/scrapers/#{scraper_name}/current_job/pages/#{gid}/log", @options.merge(opts))
      end

      # All log lines for a job.
      def all_job_log(job_id, opts={})
        self.class.get("/jobs/#{job_id}/log", @options.merge(opts))
      end

      # All log lines for a scraper's current job.
      def scraper_all_job_log(scraper_name, opts={})
        self.class.get("/scrapers/#{scraper_name}/current_job/log", @options.merge(opts))
      end
    end
  end
end
@@ -0,0 +1,19 @@
1
module AnswersEngine
  module Client
    # Read access to a job's output collections and records.
    class JobOutput < AnswersEngine::Client::Base
      # Fetches a single record by id from the given collection.
      def find(job_id, collection, id)
        path = "/jobs/#{job_id}/output/collections/#{collection}/records/#{id}"
        self.class.get(path, @options)
      end

      # Lists records in a collection ('default' when omitted).
      def all(job_id, collection = 'default')
        path = "/jobs/#{job_id}/output/collections/#{collection}/records"
        self.class.get(path, @options)
      end

      # Lists the collections the job has written to.
      def collections(job_id)
        self.class.get("/jobs/#{job_id}/output/collections", @options)
      end
    end
  end
end
@@ -0,0 +1,62 @@
1
module AnswersEngine
  module Client
    # Client for pages within a job: lookup, metadata updates, enqueueing new
    # pages to fetch, and the parsing-update callback used by the parser
    # executor.
    class JobPage < AnswersEngine::Client::Base
      def find(job_id, gid)
        self.class.get("/jobs/#{job_id}/pages/#{gid}", @options)
      end

      # +opts+ accepted for symmetry; pagination is handled by Base via @options.
      def all(job_id, opts={})
        self.class.get("/jobs/#{job_id}/pages", @options)
      end

      # Updates page_type/priority/vars on an existing page.
      def update(job_id, gid, opts={})
        body = {}
        body[:page_type] = opts[:page_type] if opts[:page_type]
        body[:priority] = opts[:priority] if opts[:priority]
        body[:vars] = opts[:vars] if opts[:vars]

        @options.merge!({body: body.to_json})

        self.class.put("/jobs/#{job_id}/pages/#{gid}", @options)
      end

      # Re-queues the page for fetching and parsing.
      def reset(job_id, gid, opts={})
        self.class.put("/jobs/#{job_id}/pages/#{gid}/reset", @options)
      end

      # Enqueues a new page (HTTP request) onto the job.
      # BUGFIX: method now defaults to "GET" when nil as well as when empty —
      # previously nil passed the `method != ""` check and was sent as-is.
      def enqueue(job_id, method, url, opts={})
        body = {}
        body[:method] = (method.nil? || method == "") ? "GET" : method
        body[:url] = url
        body[:page_type] = opts[:page_type] if opts[:page_type]
        body[:priority] = opts[:priority] if opts[:priority]
        body[:fetch_type] = opts[:fetch_type] if opts[:fetch_type]
        body[:body] = opts[:body] if opts[:body]
        body[:headers] = opts[:headers] if opts[:headers]
        body[:vars] = opts[:vars] if opts[:vars]
        body[:force_fetch] = opts[:force_fetch] if opts[:force_fetch]
        body[:freshness] = opts[:freshness] if opts[:freshness]
        body[:ua_type] = opts[:ua_type] if opts[:ua_type]
        body[:no_redirect] = opts[:no_redirect] if opts[:no_redirect]
        body[:cookie] = opts[:cookie] if opts[:cookie]

        @options.merge!({body: body.to_json})

        self.class.post("/jobs/#{job_id}/pages", @options)
      end

      # Reports parsing results: outputs, newly discovered pages, the final
      # parsing status, and an optional error log line.
      def parsing_update(job_id, gid, opts={})
        body = {}
        body[:outputs] = opts.fetch(:outputs) {[]}
        body[:pages] = opts.fetch(:pages) {[]}
        body[:parsing_status] = opts.fetch(:parsing_status){ nil }
        body[:log_error] = opts[:log_error] if opts[:log_error]

        @options.merge!({body: body.to_json})

        self.class.put("/jobs/#{job_id}/pages/#{gid}/parsing_update", @options)
      end
    end
  end
end
@@ -0,0 +1,16 @@
1
module AnswersEngine
  module Client
    # Live statistics for jobs.
    class JobStat < AnswersEngine::Client::Base
      # Current stats for a specific job id.
      def job_current_stats(job_id)
        path = "/jobs/#{job_id}/stats/current"
        self.class.get(path, @options)
      end

      # Current stats for the scraper's currently running job.
      def scraper_job_current_stats(scraper_name)
        path = "/scrapers/#{scraper_name}/current_job/stats/current"
        self.class.get(path, @options)
      end
    end
  end
end
@@ -0,0 +1,54 @@
1
module AnswersEngine
  module Client
    # Client for the /scrapers CRUD endpoints.
    class Scraper < AnswersEngine::Client::Base

      def find(scraper_name)
        self.class.get("/scrapers/#{scraper_name}", @options)
      end

      # +opts+ accepted for symmetry; pagination is handled by Base via @options.
      def all(opts={})
        self.class.get("/scrapers", @options)
      end

      # Registers a new scraper backed by a git repository.
      # git_branch defaults to "master" when opts[:branch] is absent.
      def create(scraper_name, git_repository, opts={})
        body = {
          name: scraper_name,
          git_repository: git_repository,
          git_branch: opts[:branch] ? opts[:branch] : "master"}

        body[:freshness_type] = opts[:freshness_type] if opts[:freshness_type]
        body[:force_fetch] = opts[:force_fetch] if opts[:force_fetch]
        body[:standard_worker_count] = opts[:workers] if opts[:workers]
        body[:browser_worker_count] = opts[:browsers] if opts[:browsers]
        body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
        body[:disable_scheduler] = opts[:disable_scheduler] if opts[:disable_scheduler]
        body[:cancel_current_job] = opts[:cancel_current_job] if opts[:cancel_current_job]
        body[:schedule] = opts[:schedule] if opts[:schedule]
        body[:timezone] = opts[:timezone] if opts[:timezone]
        @options.merge!({body: body.to_json})
        self.class.post("/scrapers", @options)
      end

      # Updates scraper settings. Boolean flags (:force_fetch,
      # :disable_scheduler, :cancel_current_job) are sent whenever the key is
      # present — even when false — so callers can switch them off.
      #
      # BUGFIX: the presence checks previously used STRING keys
      # (opts.has_key?("force_fetch")) while the values were read with SYMBOL
      # keys, so plain symbol-keyed hashes were silently ignored and plain
      # string-keyed hashes sent nil. Both key forms are now accepted.
      def update(scraper_name, opts={})
        body = {}

        body[:name] = opts[:name] if opts[:name]
        body[:git_repository] = opts[:repo] if opts[:repo]
        body[:git_branch] = opts[:branch] if opts[:branch]
        body[:freshness_type] = opts[:freshness_type] if opts[:freshness_type]
        body[:force_fetch] = flag_value(opts, :force_fetch) if flag_given?(opts, :force_fetch)
        body[:standard_worker_count] = opts[:workers] if opts[:workers]
        body[:browser_worker_count] = opts[:browsers] if opts[:browsers]
        body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
        body[:disable_scheduler] = flag_value(opts, :disable_scheduler) if flag_given?(opts, :disable_scheduler)
        body[:cancel_current_job] = flag_value(opts, :cancel_current_job) if flag_given?(opts, :cancel_current_job)
        body[:schedule] = opts[:schedule] if opts[:schedule]
        body[:timezone] = opts[:timezone] if opts[:timezone]
        @options.merge!({body: body.to_json})

        self.class.put("/scrapers/#{scraper_name}", @options)
      end

      private

      # True when opts carries +name+ under either a symbol or string key.
      def flag_given?(opts, name)
        opts.key?(name) || opts.key?(name.to_s)
      end

      # Reads +name+ from opts, preferring the symbol key.
      def flag_value(opts, name)
        opts.key?(name) ? opts[name] : opts[name.to_s]
      end
    end
  end
end
@@ -0,0 +1,17 @@
1
module AnswersEngine
  module Client
    # Deployments of a scraper's git repository.
    class ScraperDeployment < AnswersEngine::Client::Base
      # Lists past deployments. +opts+ is unused; kept for interface symmetry.
      def all(scraper_name, opts = {})
        path = "/scrapers/#{scraper_name}/deployments"
        self.class.get(path, @options)
      end

      # Triggers a new deployment from the scraper's configured repository.
      def deploy(scraper_name, opts = {})
        path = "/scrapers/#{scraper_name}/deployments"
        self.class.post(path, @options)
      end
    end
  end
end
@@ -0,0 +1,22 @@
1
module AnswersEngine
  module Client
    # Exports scoped to a scraper, plus export lookup and download.
    class ScraperExport < AnswersEngine::Client::Base
      # Lists exports belonging to a scraper.
      def all(scraper_name, opts = {})
        path = "/scrapers/#{scraper_name}/exports"
        self.class.get(path, @options)
      end

      # Looks up a single export by id.
      def find(export_id)
        path = "/scrapers/exports/#{export_id}"
        self.class.get(path, @options)
      end

      # Starts an export using the named exporter definition.
      def create(scraper_name, exporter_name)
        path = "/scrapers/#{scraper_name}/exports/#{exporter_name}"
        self.class.post(path, @options)
      end

      # Downloads the export payload.
      def download(export_id)
        path = "/scrapers/exports/#{export_id}/download"
        self.class.get(path, @options)
      end
    end
  end
end
@@ -0,0 +1,14 @@
1
module AnswersEngine
  module Client
    # Exporter definitions configured on a scraper.
    class ScraperExporter < AnswersEngine::Client::Base
      # Lists all exporters defined on the scraper. +opts+ is unused; kept
      # for interface symmetry.
      def all(scraper_name, opts = {})
        path = "/scrapers/#{scraper_name}/exporters"
        self.class.get(path, @options)
      end

      # Looks up one exporter by name.
      def find(scraper_name, exporter_name)
        path = "/scrapers/#{scraper_name}/exporters/#{exporter_name}"
        self.class.get(path, @options)
      end
    end
  end
end
@@ -0,0 +1,49 @@
1
module AnswersEngine
  module Client
    # Client for a scraper's jobs, primarily its current job.
    class ScraperJob < AnswersEngine::Client::Base
      # Caller-facing option name -> API body field, shared by #create/#update.
      WORKER_FIELDS = {
        workers: :standard_worker_count,
        browsers: :browser_worker_count,
        proxy_type: :proxy_type
      }.freeze

      def all(scraper_name, opts={})
        self.class.get("/scrapers/#{scraper_name}/jobs", @options)
      end

      # Starts a new job for the scraper.
      def create(scraper_name, opts={})
        payload = worker_body(opts)
        @options.merge!(body: payload.to_json)
        self.class.post("/scrapers/#{scraper_name}/jobs", @options)
      end

      # The scraper's current job.
      def find(scraper_name)
        self.class.get("/scrapers/#{scraper_name}/current_job", @options)
      end

      # Updates status/worker counts on the current job.
      def update(scraper_name, opts={})
        payload = {}
        payload[:status] = opts[:status] if opts[:status]
        payload.merge!(worker_body(opts))
        @options.merge!(body: payload.to_json)

        self.class.put("/scrapers/#{scraper_name}/current_job", @options)
      end

      # NOTE: the status helpers below deliberately write :status into the
      # caller's opts hash (merge!) before delegating to #update, matching
      # the original in-place mutation.
      def cancel(scraper_name, opts={})
        update(scraper_name, opts.merge!(status: 'cancelled'))
      end

      def resume(scraper_name, opts={})
        update(scraper_name, opts.merge!(status: 'active'))
      end

      def pause(scraper_name, opts={})
        update(scraper_name, opts.merge!(status: 'paused'))
      end

      private

      # Builds the worker-count portion of a request body from truthy opts.
      def worker_body(opts)
        WORKER_FIELDS.each_with_object({}) do |(opt, field), body|
          body[field] = opts[opt] if opts[opt]
        end
      end
    end
  end
end