answersengine 0.2.33

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65)
  1. checksums.yaml +7 -0
  2. data/.gitignore +12 -0
  3. data/.travis.yml +7 -0
  4. data/CODE_OF_CONDUCT.md +74 -0
  5. data/Gemfile +6 -0
  6. data/LICENSE.txt +21 -0
  7. data/README.md +30 -0
  8. data/Rakefile +22 -0
  9. data/answersengine.gemspec +45 -0
  10. data/bin/console +14 -0
  11. data/bin/setup +8 -0
  12. data/examples/fetchtest/libraries/hello.rb +9 -0
  13. data/examples/fetchtest/libraries/hello_fail.rb +10 -0
  14. data/examples/fetchtest/parsers/failed.rb +2 -0
  15. data/examples/fetchtest/parsers/find_outputs.rb +18 -0
  16. data/examples/fetchtest/parsers/home.rb +50 -0
  17. data/examples/fetchtest/parsers/nested_fail.rb +3 -0
  18. data/examples/fetchtest/parsers/simple.rb +14 -0
  19. data/examples/fetchtest/seeders/csv_seeder.rb +12 -0
  20. data/examples/fetchtest/seeders/failed.rb +1 -0
  21. data/examples/fetchtest/seeders/list_of_urls.csv +5 -0
  22. data/examples/fetchtest/seeders/seed.rb +28 -0
  23. data/examples/fetchtest/seeders/test_reset_page.rb +4 -0
  24. data/exe/answersengine +3 -0
  25. data/lib/answersengine.rb +5 -0
  26. data/lib/answersengine/cli.rb +33 -0
  27. data/lib/answersengine/cli/global_page.rb +39 -0
  28. data/lib/answersengine/cli/job.rb +30 -0
  29. data/lib/answersengine/cli/job_output.rb +69 -0
  30. data/lib/answersengine/cli/parser.rb +64 -0
  31. data/lib/answersengine/cli/scraper.rb +172 -0
  32. data/lib/answersengine/cli/scraper_deployment.rb +24 -0
  33. data/lib/answersengine/cli/scraper_export.rb +51 -0
  34. data/lib/answersengine/cli/scraper_exporter.rb +40 -0
  35. data/lib/answersengine/cli/scraper_job.rb +71 -0
  36. data/lib/answersengine/cli/scraper_page.rb +200 -0
  37. data/lib/answersengine/cli/seeder.rb +40 -0
  38. data/lib/answersengine/client.rb +23 -0
  39. data/lib/answersengine/client/backblaze_content.rb +45 -0
  40. data/lib/answersengine/client/base.rb +50 -0
  41. data/lib/answersengine/client/export.rb +10 -0
  42. data/lib/answersengine/client/global_page.rb +18 -0
  43. data/lib/answersengine/client/job.rb +53 -0
  44. data/lib/answersengine/client/job_export.rb +10 -0
  45. data/lib/answersengine/client/job_log.rb +27 -0
  46. data/lib/answersengine/client/job_output.rb +19 -0
  47. data/lib/answersengine/client/job_page.rb +62 -0
  48. data/lib/answersengine/client/job_stat.rb +16 -0
  49. data/lib/answersengine/client/scraper.rb +54 -0
  50. data/lib/answersengine/client/scraper_deployment.rb +17 -0
  51. data/lib/answersengine/client/scraper_export.rb +22 -0
  52. data/lib/answersengine/client/scraper_exporter.rb +14 -0
  53. data/lib/answersengine/client/scraper_job.rb +49 -0
  54. data/lib/answersengine/client/scraper_job_output.rb +19 -0
  55. data/lib/answersengine/client/scraper_job_page.rb +55 -0
  56. data/lib/answersengine/plugin.rb +6 -0
  57. data/lib/answersengine/plugin/context_exposer.rb +55 -0
  58. data/lib/answersengine/scraper.rb +16 -0
  59. data/lib/answersengine/scraper/executor.rb +292 -0
  60. data/lib/answersengine/scraper/parser.rb +18 -0
  61. data/lib/answersengine/scraper/ruby_parser_executor.rb +141 -0
  62. data/lib/answersengine/scraper/ruby_seeder_executor.rb +114 -0
  63. data/lib/answersengine/scraper/seeder.rb +18 -0
  64. data/lib/answersengine/version.rb +3 -0
  65. metadata +255 -0
@@ -0,0 +1,50 @@
1
require 'httparty'

module AnswersEngine
  module Client
    # Shared plumbing for every AnswersEngine API client: HTTParty setup,
    # bearer-token auth headers, and translation of the common option keys
    # into the short query-string parameters the API expects.
    class Base
      include HTTParty

      # The endpoint can be overridden through the environment.
      base_uri(ENV['ANSWERSENGINE_API_URL'].nil? ? 'https://fetch.answersengine.com/api/v1' : ENV['ANSWERSENGINE_API_URL'])

      # Default token sourced from the environment.
      def self.env_auth_token
        ENV['ANSWERSENGINE_TOKEN']
      end

      # Per-instance token, falling back to the environment token.
      def auth_token
        @auth_token ||= self.class.env_auth_token
      end

      def auth_token= value
        @auth_token = value
      end

      # Recognised opts: :auth_token, :page, :per_page, :fetch_fail,
      # :parse_fail, :page_type, :gid, :query (Hash or JSON String).
      def initialize(opts={})
        self.auth_token = opts[:auth_token] unless opts[:auth_token].nil?

        @options = {
          headers: {
            "Authorization" => "Bearer #{auth_token}",
            "Content-Type" => "application/json",
          }
        }

        query = build_query(opts)
        @options[:query] = query unless query.empty?
      end

      private

      # Map the friendly option names onto the API's query parameters.
      def build_query(opts)
        query = {}
        query[:p]         = opts[:page]       if opts[:page]
        query[:pp]        = opts[:per_page]   if opts[:per_page]
        query[:fetchfail] = opts[:fetch_fail] if opts[:fetch_fail]
        query[:parsefail] = opts[:parse_fail] if opts[:parse_fail]
        query[:page_type] = opts[:page_type]  if opts[:page_type]
        query[:gid]       = opts[:gid]        if opts[:gid]

        case opts[:query]
        when Hash
          query[:q] = opts[:query].to_json
        when String
          # Round-trip through JSON.parse so an invalid string fails early.
          query[:q] = JSON.parse(opts[:query]).to_json
        end

        query
      end
    end
  end
end
@@ -0,0 +1,10 @@
1
module AnswersEngine
  module Client
    # Client for the account-wide exports listing endpoint.
    class Export < Base
      # List every export across all scrapers.
      # NOTE(review): `opts` is accepted but never forwarded — confirm intended.
      def all(opts={})
        self.class.get("/scrapers/exports", @options)
      end
    end
  end
end
10
+
@@ -0,0 +1,18 @@
1
module AnswersEngine
  module Client
    # Read-only client for globally cached pages, addressed by GID.
    class GlobalPage < Base
      # Fetch a global page's metadata record.
      def find(gid)
        self.class.get("/global_pages/#{gid}", @options)
      end

      # Fetch the successfully fetched content body for a page.
      def find_content(gid)
        self.class.get("/global_pages/#{gid}/content", @options)
      end

      # Fetch the content captured from a failed fetch of a page.
      def find_failed_content(gid)
        self.class.get("/global_pages/#{gid}/failed_content", @options)
      end
    end
  end
end
18
+
@@ -0,0 +1,53 @@
1
module AnswersEngine
  module Client
    # Client for job-level endpoints: listing, lookup, status transitions
    # and the worker-side seeding_update callback.
    class Job < AnswersEngine::Client::Base
      # List all jobs visible to the current token.
      # NOTE(review): `opts` is accepted but never forwarded — confirm intended.
      def all(opts={})
        self.class.get("/jobs", @options)
      end

      # Fetch a single job record.
      def find(job_id)
        self.class.get("/jobs/#{job_id}", @options)
      end

      # Update a job. Recognised opts: :status, :workers (standard worker
      # count), :browsers (browser worker count).
      def update(job_id, opts={})
        body = {}
        body[:status] = opts[:status] if opts[:status]
        body[:standard_worker_count] = opts[:workers] if opts[:workers]
        body[:browser_worker_count] = opts[:browsers] if opts[:browsers]
        @options.merge!({body: body.to_json})

        self.class.put("/jobs/#{job_id}", @options)
      end

      # Fix: use opts.merge instead of mutating the caller's hash in place —
      # the old `opts[:status] = …` leaked the status change back to callers.
      def cancel(job_id, opts={})
        update(job_id, opts.merge(status: 'cancelled'))
      end

      def resume(job_id, opts={})
        update(job_id, opts.merge(status: 'active'))
      end

      def pause(job_id, opts={})
        update(job_id, opts.merge(status: 'paused'))
      end

      # Report seeding results back to the server. Recognised opts:
      # :outputs (default []), :pages (default []), :seeding_status
      # (default nil), :log_error (omitted when absent).
      def seeding_update(job_id, opts={})
        body = {}
        body[:outputs] = opts.fetch(:outputs) {[]}
        body[:pages] = opts.fetch(:pages) {[]}
        body[:seeding_status] = opts.fetch(:seeding_status){ nil }
        body[:log_error] = opts[:log_error] if opts[:log_error]

        @options.merge!({body: body.to_json})

        self.class.put("/jobs/#{job_id}/seeding_update", @options)
      end
    end
  end
end
53
+
@@ -0,0 +1,10 @@
1
module AnswersEngine
  module Client
    # Client for starting an export run against a specific job.
    class JobExport < Base
      # Kick off the named exporter for the given job.
      def create(job_id, exporter_name)
        self.class.post("/jobs/#{job_id}/exports/#{exporter_name}", @options)
      end
    end
  end
end
10
+
@@ -0,0 +1,27 @@
1
module AnswersEngine
  module Client
    # Client for reading job and page logs, addressed either by job id or
    # by scraper name (which resolves to the scraper's current job).
    class JobLog < Base
      # Log lines for one page within a specific job.
      def all_job_page_log(job_id, gid, opts={})
        fetch_log("/jobs/#{job_id}/pages/#{gid}/log", opts)
      end

      # Log lines for one page within a scraper's current job.
      def scraper_all_job_page_log(scraper_name, gid, opts={})
        fetch_log("/scrapers/#{scraper_name}/current_job/pages/#{gid}/log", opts)
      end

      # Full log for a specific job.
      def all_job_log(job_id, opts={})
        fetch_log("/jobs/#{job_id}/log", opts)
      end

      # Full log for a scraper's current job.
      def scraper_all_job_log(scraper_name, opts={})
        fetch_log("/scrapers/#{scraper_name}/current_job/log", opts)
      end

      private

      # Merge caller-supplied request opts into the shared options and GET.
      def fetch_log(path, opts)
        @options.merge!(opts)
        self.class.get(path, @options)
      end
    end
  end
end
27
+
@@ -0,0 +1,19 @@
1
module AnswersEngine
  module Client
    # Client for reading parser outputs stored under a job's collections.
    class JobOutput < Base
      # Fetch a single output record by collection and record id.
      def find(job_id, collection, id)
        self.class.get("/jobs/#{job_id}/output/collections/#{collection}/records/#{id}", @options)
      end

      # List records in a collection ('default' when not specified).
      def all(job_id, collection = 'default')
        self.class.get("/jobs/#{job_id}/output/collections/#{collection}/records", @options)
      end

      # List the collections a job has written outputs into.
      def collections(job_id)
        self.class.get("/jobs/#{job_id}/output/collections", @options)
      end
    end
  end
end
19
+
@@ -0,0 +1,62 @@
1
module AnswersEngine
  module Client
    # Client for pages belonging to a job: lookup, update, reset, enqueueing
    # new fetches and the worker-side parsing_update callback.
    class JobPage < AnswersEngine::Client::Base
      # Fetch one page of a job by GID.
      def find(job_id, gid)
        self.class.get("/jobs/#{job_id}/pages/#{gid}", @options)
      end

      # List a job's pages.
      # NOTE(review): `opts` is accepted but never forwarded — confirm intended.
      def all(job_id, opts={})
        self.class.get("/jobs/#{job_id}/pages", @options)
      end

      # Update a page's :page_type, :priority and/or :vars.
      def update(job_id, gid, opts={})
        body = {}
        body[:page_type] = opts[:page_type] if opts[:page_type]
        body[:priority] = opts[:priority] if opts[:priority]
        body[:vars] = opts[:vars] if opts[:vars]

        @options.merge!({body: body.to_json})

        self.class.put("/jobs/#{job_id}/pages/#{gid}", @options)
      end

      # Reset a page so it gets fetched and parsed again.
      def reset(job_id, gid, opts={})
        self.class.put("/jobs/#{job_id}/pages/#{gid}/reset", @options)
      end

      # Enqueue a page to be fetched. `method` defaults to "GET" when nil or
      # empty. Fix: the old check (`method != ""`) let nil through and sent
      # a JSON null HTTP method to the server.
      # Recognised opts: :page_type, :priority, :fetch_type, :body, :headers,
      # :vars, :force_fetch, :freshness, :ua_type, :no_redirect, :cookie.
      def enqueue(job_id, method, url, opts={})
        body = {}
        body[:method] = method.nil? || method == "" ? "GET" : method
        body[:url] = url
        body[:page_type] = opts[:page_type] if opts[:page_type]
        body[:priority] = opts[:priority] if opts[:priority]
        body[:fetch_type] = opts[:fetch_type] if opts[:fetch_type]
        body[:body] = opts[:body] if opts[:body]
        body[:headers] = opts[:headers] if opts[:headers]
        body[:vars] = opts[:vars] if opts[:vars]
        body[:force_fetch] = opts[:force_fetch] if opts[:force_fetch]
        body[:freshness] = opts[:freshness] if opts[:freshness]
        body[:ua_type] = opts[:ua_type] if opts[:ua_type]
        body[:no_redirect] = opts[:no_redirect] if opts[:no_redirect]
        body[:cookie] = opts[:cookie] if opts[:cookie]

        @options.merge!({body: body.to_json})

        self.class.post("/jobs/#{job_id}/pages", @options)
      end

      # Report parsing results back to the server. Recognised opts:
      # :outputs (default []), :pages (default []), :parsing_status
      # (default nil), :log_error (omitted when absent).
      def parsing_update(job_id, gid, opts={})
        body = {}
        body[:outputs] = opts.fetch(:outputs) {[]}
        body[:pages] = opts.fetch(:pages) {[]}
        body[:parsing_status] = opts.fetch(:parsing_status){ nil }
        body[:log_error] = opts[:log_error] if opts[:log_error]

        @options.merge!({body: body.to_json})

        self.class.put("/jobs/#{job_id}/pages/#{gid}/parsing_update", @options)
      end
    end
  end
end
62
+
@@ -0,0 +1,16 @@
1
module AnswersEngine
  module Client
    # Read-only client for live job statistics.
    class JobStat < Base
      # Current stats for a specific job.
      def job_current_stats(job_id)
        self.class.get("/jobs/#{job_id}/stats/current", @options)
      end

      # Current stats for a scraper's current job.
      def scraper_job_current_stats(scraper_name)
        self.class.get("/scrapers/#{scraper_name}/current_job/stats/current", @options)
      end
    end
  end
end
16
+
@@ -0,0 +1,54 @@
1
module AnswersEngine
  module Client
    # Client for scraper CRUD endpoints.
    class Scraper < AnswersEngine::Client::Base
      # Fetch one scraper by name.
      def find(scraper_name)
        self.class.get("/scrapers/#{scraper_name}", @options)
      end

      # List all scrapers.
      # NOTE(review): `opts` is accepted but never forwarded — confirm intended.
      def all(opts={})
        self.class.get("/scrapers", @options)
      end

      # Create a scraper backed by a git repository. Recognised opts:
      # :branch (default "master"), :freshness_type, :force_fetch, :workers,
      # :browsers, :proxy_type, :disable_scheduler, :cancel_current_job,
      # :schedule, :timezone.
      def create(scraper_name, git_repository, opts={})
        body = {
          name: scraper_name,
          git_repository: git_repository,
          git_branch: opts[:branch] ? opts[:branch] : "master"}

        body[:freshness_type] = opts[:freshness_type] if opts[:freshness_type]
        body[:force_fetch] = opts[:force_fetch] if opts[:force_fetch]
        body[:standard_worker_count] = opts[:workers] if opts[:workers]
        body[:browser_worker_count] = opts[:browsers] if opts[:browsers]
        body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
        body[:disable_scheduler] = opts[:disable_scheduler] if opts[:disable_scheduler]
        body[:cancel_current_job] = opts[:cancel_current_job] if opts[:cancel_current_job]
        body[:schedule] = opts[:schedule] if opts[:schedule]
        body[:timezone] = opts[:timezone] if opts[:timezone]
        @options.merge!({body: body.to_json})
        self.class.post("/scrapers", @options)
      end

      # Update a scraper. The boolean flags (:force_fetch,
      # :disable_scheduler, :cancel_current_job) are sent whenever the key is
      # present so `false` can be transmitted explicitly.
      # Fix: the old code guarded these with has_key?("…") (String key) while
      # reading opts[:…] (Symbol key), so symbol-keyed hashes silently dropped
      # the flag and plain string-keyed hashes sent a JSON null. Both key
      # styles are now accepted (Thor option hashes use String keys).
      def update(scraper_name, opts={})
        body = {}

        body[:name] = opts[:name] if opts[:name]
        body[:git_repository] = opts[:repo] if opts[:repo]
        body[:git_branch] = opts[:branch] if opts[:branch]
        body[:freshness_type] = opts[:freshness_type] if opts[:freshness_type]
        body[:standard_worker_count] = opts[:workers] if opts[:workers]
        body[:browser_worker_count] = opts[:browsers] if opts[:browsers]
        body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
        body[:schedule] = opts[:schedule] if opts[:schedule]
        body[:timezone] = opts[:timezone] if opts[:timezone]

        [:force_fetch, :disable_scheduler, :cancel_current_job].each do |flag|
          present, value = fetch_flag(opts, flag)
          body[flag] = value if present
        end

        @options.merge!({body: body.to_json})

        self.class.put("/scrapers/#{scraper_name}", @options)
      end

      private

      # Look up a flag under either its Symbol or String key.
      # Returns [present?, value].
      def fetch_flag(opts, name)
        if opts.key?(name)
          [true, opts[name]]
        elsif opts.key?(name.to_s)
          [true, opts[name.to_s]]
        else
          [false, nil]
        end
      end
    end
  end
end
54
+
@@ -0,0 +1,17 @@
1
module AnswersEngine
  module Client
    # Client for scraper deployment endpoints.
    class ScraperDeployment < Base
      # List a scraper's past deployments.
      # NOTE(review): `opts` is accepted but never forwarded — confirm intended.
      def all(scraper_name, opts={})
        self.class.get("/scrapers/#{scraper_name}/deployments", @options)
      end

      # Trigger a new deployment of the scraper's repository.
      def deploy(scraper_name, opts={})
        self.class.post("/scrapers/#{scraper_name}/deployments", @options)
      end
    end
  end
end
17
+
@@ -0,0 +1,22 @@
1
module AnswersEngine
  module Client
    # Client for export runs scoped to a scraper.
    class ScraperExport < Base
      # List a scraper's exports.
      # NOTE(review): `opts` is accepted but never forwarded — confirm intended.
      def all(scraper_name, opts={})
        self.class.get("/scrapers/#{scraper_name}/exports", @options)
      end

      # Fetch one export by id.
      def find(export_id)
        self.class.get("/scrapers/exports/#{export_id}", @options)
      end

      # Start the named exporter for a scraper.
      def create(scraper_name, exporter_name)
        self.class.post("/scrapers/#{scraper_name}/exports/#{exporter_name}", @options)
      end

      # Download a finished export by id.
      def download(export_id)
        self.class.get("/scrapers/exports/#{export_id}/download", @options)
      end
    end
  end
end
22
+
@@ -0,0 +1,14 @@
1
module AnswersEngine
  module Client
    # Read-only client for the exporters configured on a scraper.
    class ScraperExporter < Base
      # List a scraper's exporters.
      # NOTE(review): `opts` is accepted but never forwarded — confirm intended.
      def all(scraper_name, opts={})
        self.class.get("/scrapers/#{scraper_name}/exporters", @options)
      end

      # Fetch one exporter by name.
      def find(scraper_name, exporter_name)
        self.class.get("/scrapers/#{scraper_name}/exporters/#{exporter_name}", @options)
      end
    end
  end
end
14
+
@@ -0,0 +1,49 @@
1
module AnswersEngine
  module Client
    # Client for a scraper's jobs, primarily its current job.
    class ScraperJob < AnswersEngine::Client::Base
      # List a scraper's jobs.
      # NOTE(review): `opts` is accepted but never forwarded — confirm intended.
      def all(scraper_name, opts={})
        self.class.get("/scrapers/#{scraper_name}/jobs", @options)
      end

      # Start a new job. Recognised opts: :workers (standard worker count),
      # :browsers (browser worker count), :proxy_type.
      def create(scraper_name, opts={})
        body = {}
        body[:standard_worker_count] = opts[:workers] if opts[:workers]
        body[:browser_worker_count] = opts[:browsers] if opts[:browsers]
        body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
        @options.merge!({body: body.to_json})
        self.class.post("/scrapers/#{scraper_name}/jobs", @options)
      end

      # Fetch the scraper's current job.
      def find(scraper_name)
        self.class.get("/scrapers/#{scraper_name}/current_job", @options)
      end

      # Update the current job's :status, :workers, :browsers, :proxy_type.
      def update(scraper_name, opts={})
        body = {}
        body[:status] = opts[:status] if opts[:status]
        body[:standard_worker_count] = opts[:workers] if opts[:workers]
        body[:browser_worker_count] = opts[:browsers] if opts[:browsers]
        body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
        @options.merge!({body: body.to_json})

        self.class.put("/scrapers/#{scraper_name}/current_job", @options)
      end

      # Fix: use opts.merge instead of mutating the caller's hash in place —
      # the old `opts[:status] = …` leaked the status change back to callers.
      def cancel(scraper_name, opts={})
        update(scraper_name, opts.merge(status: 'cancelled'))
      end

      def resume(scraper_name, opts={})
        update(scraper_name, opts.merge(status: 'active'))
      end

      def pause(scraper_name, opts={})
        update(scraper_name, opts.merge(status: 'paused'))
      end
    end
  end
end
49
+