answersengine 0.10.1 → 0.10.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72)
  1. checksums.yaml +4 -4
  2. data/CODE_OF_CONDUCT.md +1 -1
  3. data/LICENSE.txt +1 -1
  4. data/README.md +3 -4
  5. data/answersengine.gemspec +6 -12
  6. data/exe/answersengine +3 -2
  7. data/lib/answersengine.rb +20 -3
  8. metadata +14 -152
  9. data/examples/fetchtest/libraries/hello.rb +0 -9
  10. data/examples/fetchtest/libraries/hello_fail.rb +0 -10
  11. data/examples/fetchtest/parsers/failed.rb +0 -2
  12. data/examples/fetchtest/parsers/find_outputs.rb +0 -18
  13. data/examples/fetchtest/parsers/home.rb +0 -50
  14. data/examples/fetchtest/parsers/nested_fail.rb +0 -3
  15. data/examples/fetchtest/parsers/simple.rb +0 -14
  16. data/examples/fetchtest/seeders/csv_seeder.rb +0 -12
  17. data/examples/fetchtest/seeders/failed.rb +0 -1
  18. data/examples/fetchtest/seeders/list_of_urls.csv +0 -5
  19. data/examples/fetchtest/seeders/seed.rb +0 -28
  20. data/examples/fetchtest/seeders/test_reset_page.rb +0 -4
  21. data/lib/answersengine/cli.rb +0 -45
  22. data/lib/answersengine/cli/env_var.rb +0 -48
  23. data/lib/answersengine/cli/finisher.rb +0 -40
  24. data/lib/answersengine/cli/global_page.rb +0 -39
  25. data/lib/answersengine/cli/job.rb +0 -30
  26. data/lib/answersengine/cli/job_output.rb +0 -69
  27. data/lib/answersengine/cli/parser.rb +0 -64
  28. data/lib/answersengine/cli/scraper.rb +0 -185
  29. data/lib/answersengine/cli/scraper_deployment.rb +0 -24
  30. data/lib/answersengine/cli/scraper_export.rb +0 -51
  31. data/lib/answersengine/cli/scraper_exporter.rb +0 -40
  32. data/lib/answersengine/cli/scraper_finisher.rb +0 -20
  33. data/lib/answersengine/cli/scraper_job.rb +0 -75
  34. data/lib/answersengine/cli/scraper_job_var.rb +0 -48
  35. data/lib/answersengine/cli/scraper_page.rb +0 -203
  36. data/lib/answersengine/cli/scraper_var.rb +0 -48
  37. data/lib/answersengine/cli/seeder.rb +0 -40
  38. data/lib/answersengine/client.rb +0 -29
  39. data/lib/answersengine/client/auth_token.rb +0 -50
  40. data/lib/answersengine/client/backblaze_content.rb +0 -45
  41. data/lib/answersengine/client/base.rb +0 -55
  42. data/lib/answersengine/client/deploy_key.rb +0 -21
  43. data/lib/answersengine/client/env_var.rb +0 -28
  44. data/lib/answersengine/client/export.rb +0 -10
  45. data/lib/answersengine/client/global_page.rb +0 -18
  46. data/lib/answersengine/client/job.rb +0 -64
  47. data/lib/answersengine/client/job_export.rb +0 -10
  48. data/lib/answersengine/client/job_log.rb +0 -26
  49. data/lib/answersengine/client/job_output.rb +0 -19
  50. data/lib/answersengine/client/job_page.rb +0 -58
  51. data/lib/answersengine/client/job_stat.rb +0 -16
  52. data/lib/answersengine/client/scraper.rb +0 -57
  53. data/lib/answersengine/client/scraper_deployment.rb +0 -18
  54. data/lib/answersengine/client/scraper_export.rb +0 -22
  55. data/lib/answersengine/client/scraper_exporter.rb +0 -14
  56. data/lib/answersengine/client/scraper_finisher.rb +0 -16
  57. data/lib/answersengine/client/scraper_job.rb +0 -49
  58. data/lib/answersengine/client/scraper_job_output.rb +0 -19
  59. data/lib/answersengine/client/scraper_job_page.rb +0 -67
  60. data/lib/answersengine/client/scraper_job_var.rb +0 -28
  61. data/lib/answersengine/client/scraper_var.rb +0 -28
  62. data/lib/answersengine/plugin.rb +0 -6
  63. data/lib/answersengine/plugin/context_exposer.rb +0 -55
  64. data/lib/answersengine/scraper.rb +0 -18
  65. data/lib/answersengine/scraper/executor.rb +0 -373
  66. data/lib/answersengine/scraper/finisher.rb +0 -18
  67. data/lib/answersengine/scraper/parser.rb +0 -18
  68. data/lib/answersengine/scraper/ruby_finisher_executor.rb +0 -116
  69. data/lib/answersengine/scraper/ruby_parser_executor.rb +0 -200
  70. data/lib/answersengine/scraper/ruby_seeder_executor.rb +0 -120
  71. data/lib/answersengine/scraper/seeder.rb +0 -18
  72. data/lib/answersengine/version.rb +0 -3
@@ -1,19 +0,0 @@
1
- module AnswersEngine
2
- module Client
3
- class JobOutput < AnswersEngine::Client::Base
4
- def find(job_id, collection, id)
5
- self.class.get("/jobs/#{job_id}/output/collections/#{collection}/records/#{id}", @options)
6
- end
7
-
8
- def all(job_id, collection = 'default')
9
-
10
- self.class.get("/jobs/#{job_id}/output/collections/#{collection}/records", @options)
11
- end
12
-
13
- def collections(job_id)
14
- self.class.get("/jobs/#{job_id}/output/collections", @options)
15
- end
16
- end
17
- end
18
- end
19
-
@@ -1,58 +0,0 @@
1
- module AnswersEngine
2
- module Client
3
- class JobPage < AnswersEngine::Client::Base
4
- def find(job_id, gid)
5
- self.class.get("/jobs/#{job_id}/pages/#{gid}", @options)
6
- end
7
-
8
- def all(job_id, opts={})
9
- params = @options.merge(opts)
10
- self.class.get("/jobs/#{job_id}/pages", params)
11
- end
12
-
13
- def update(job_id, gid, opts={})
14
- body = {}
15
- body[:page_type] = opts[:page_type] if opts[:page_type]
16
- body[:priority] = opts[:priority] if opts[:priority]
17
- body[:vars] = opts[:vars] if opts[:vars]
18
-
19
- params = @options.merge({body: body.to_json})
20
-
21
- self.class.put("/jobs/#{job_id}/pages/#{gid}", params)
22
- end
23
-
24
- def enqueue(job_id, method, url, opts={})
25
- body = {}
26
- body[:method] = method != "" ? method : "GET"
27
- body[:url] = url
28
- body[:page_type] = opts[:page_type] if opts[:page_type]
29
- body[:priority] = opts[:priority] if opts[:priority]
30
- body[:fetch_type] = opts[:fetch_type] if opts[:fetch_type]
31
- body[:body] = opts[:body] if opts[:body]
32
- body[:headers] = opts[:headers] if opts[:headers]
33
- body[:vars] = opts[:vars] if opts[:vars]
34
- body[:force_fetch] = opts[:force_fetch] if opts[:force_fetch]
35
- body[:freshness] = opts[:freshness] if opts[:freshness]
36
- body[:ua_type] = opts[:ua_type] if opts[:ua_type]
37
- body[:no_redirect] = opts[:no_redirect] if opts[:no_redirect]
38
- body[:cookie] = opts[:cookie] if opts[:cookie]
39
-
40
- params = @options.merge({body: body.to_json})
41
-
42
- self.class.post("/jobs/#{job_id}/pages", params)
43
- end
44
-
45
- def parsing_update(job_id, gid, opts={})
46
- body = {}
47
- body[:outputs] = opts.fetch(:outputs) {[]}
48
- body[:pages] = opts.fetch(:pages) {[]}
49
- body[:parsing_status] = opts.fetch(:parsing_status){ nil }
50
- body[:log_error] = opts[:log_error] if opts[:log_error]
51
-
52
- params = @options.merge({body: body.to_json})
53
-
54
- self.class.put("/jobs/#{job_id}/pages/#{gid}/parsing_update", params)
55
- end
56
- end
57
- end
58
- end
@@ -1,16 +0,0 @@
1
- module AnswersEngine
2
- module Client
3
- class JobStat < AnswersEngine::Client::Base
4
-
5
- def job_current_stats(job_id)
6
- self.class.get("/jobs/#{job_id}/stats/current", @options)
7
- end
8
-
9
- def scraper_job_current_stats(scraper_name)
10
- self.class.get("/scrapers/#{scraper_name}/current_job/stats/current", @options)
11
- end
12
-
13
- end
14
- end
15
- end
16
-
@@ -1,57 +0,0 @@
1
- module AnswersEngine
2
- module Client
3
- class Scraper < AnswersEngine::Client::Base
4
-
5
- def find(scraper_name)
6
- self.class.get("/scrapers/#{scraper_name}", @options)
7
- end
8
-
9
- def all(opts={})
10
- params = @options.merge opts
11
- self.class.get("/scrapers", params)
12
- end
13
-
14
- def create(scraper_name, git_repository, opts={})
15
- body = {}
16
- body[:name] = scraper_name
17
- body[:git_repository] = git_repository
18
- body[:git_branch] = opts[:branch] || opts[:git_branch] || "master" if opts[:branch] || opts[:git_branch]
19
- body[:freshness_type] = opts[:freshness_type] if opts[:freshness_type]
20
- body[:force_fetch] = opts[:force_fetch] if opts[:force_fetch]
21
- body[:standard_worker_count] = opts[:workers] || opts[:standard_worker_count] if opts[:workers] || opts[:standard_worker_count]
22
- body[:browser_worker_count] = opts[:browsers] || opts[:browser_worker_count] if opts[:browsers] || opts[:browser_worker_count]
23
- body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
24
- body[:disable_scheduler] = opts[:disable_scheduler] if opts[:disable_scheduler]
25
- body[:cancel_current_job] = opts[:cancel_current_job] if opts[:cancel_current_job]
26
- body[:schedule] = opts[:schedule] if opts[:schedule]
27
- body[:timezone] = opts[:timezone] if opts[:timezone]
28
- params = @options.merge({body: body.to_json})
29
- self.class.post("/scrapers", params)
30
- end
31
-
32
- def update(scraper_name, opts={})
33
- body = {}
34
- body[:name] = opts[:name] if opts[:name]
35
- body[:git_repository] = opts[:repo] || opts[:git_repository] if opts[:repo] || opts[:git_repository]
36
- body[:git_branch] = opts[:branch] || opts[:git_branch] if opts[:branch] || opts[:git_branch]
37
- body[:freshness_type] = opts[:freshness_type] if opts[:freshness_type]
38
- body[:force_fetch] = opts[:force_fetch] if opts.has_key?("force_fetch") || opts.has_key?(:force_fetch)
39
- body[:standard_worker_count] = opts[:workers] || opts[:standard_worker_count] if opts[:workers] || opts[:standard_worker_count]
40
- body[:browser_worker_count] = opts[:browsers] || opts[:browser_worker_count] if opts[:browsers] || opts[:browser_worker_count]
41
- body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
42
- body[:disable_scheduler] = opts[:disable_scheduler] if opts.has_key?("disable_scheduler") || opts.has_key?(:disable_scheduler)
43
- body[:cancel_current_job] = opts[:cancel_current_job] if opts.has_key?("cancel_current_job") || opts.has_key?(:cancel_current_job)
44
- body[:schedule] = opts[:schedule] if opts[:schedule]
45
- body[:timezone] = opts[:timezone] if opts[:timezone]
46
- params = @options.merge({body: body.to_json})
47
-
48
- self.class.put("/scrapers/#{scraper_name}", params)
49
- end
50
-
51
- def delete(scraper_name, opts={})
52
- params = @options.merge(opts)
53
- self.class.delete("/scrapers/#{scraper_name}", params)
54
- end
55
- end
56
- end
57
- end
@@ -1,18 +0,0 @@
1
- module AnswersEngine
2
- module Client
3
- class ScraperDeployment < AnswersEngine::Client::Base
4
-
5
- def all(scraper_name, opts={})
6
- params = @options.merge(opts)
7
- self.class.get("/scrapers/#{scraper_name}/deployments", params)
8
- end
9
-
10
-
11
- def deploy(scraper_name, opts={})
12
- params = @options.merge(opts)
13
- self.class.post("/scrapers/#{scraper_name}/deployments", params)
14
- end
15
-
16
- end
17
- end
18
- end
@@ -1,22 +0,0 @@
1
- module AnswersEngine
2
- module Client
3
- class ScraperExport < AnswersEngine::Client::Base
4
- def all(scraper_name, opts={})
5
- params = @options.merge(opts)
6
- self.class.get("/scrapers/#{scraper_name}/exports", params)
7
- end
8
-
9
- def find(export_id)
10
- self.class.get("/scrapers/exports/#{export_id}", @options)
11
- end
12
-
13
- def create(scraper_name, exporter_name)
14
- self.class.post("/scrapers/#{scraper_name}/exports/#{exporter_name}", @options)
15
- end
16
-
17
- def download(export_id)
18
- self.class.get("/scrapers/exports/#{export_id}/download", @options)
19
- end
20
- end
21
- end
22
- end
@@ -1,14 +0,0 @@
1
- module AnswersEngine
2
- module Client
3
- class ScraperExporter < AnswersEngine::Client::Base
4
- def all(scraper_name, opts={})
5
- params = @options.merge(opts)
6
- self.class.get("/scrapers/#{scraper_name}/exporters", params)
7
- end
8
-
9
- def find(scraper_name, exporter_name)
10
- self.class.get("/scrapers/#{scraper_name}/exporters/#{exporter_name}", @options)
11
- end
12
- end
13
- end
14
- end
@@ -1,16 +0,0 @@
1
- module AnswersEngine
2
- module Client
3
- class ScraperFinisher < AnswersEngine::Client::Base
4
- # Reset finisher on a scraper's current job.
5
- #
6
- # @param [String] scraper_name Scraper name.
7
- # @param [Hash] opts ({}) API custom parameters.
8
- #
9
- # @return [HTTParty::Response]
10
- def reset(scraper_name, opts={})
11
- params = @options.merge(opts)
12
- self.class.put("/scrapers/#{scraper_name}/current_job/finisher/reset", params)
13
- end
14
- end
15
- end
16
- end
@@ -1,49 +0,0 @@
1
- module AnswersEngine
2
- module Client
3
- class ScraperJob < AnswersEngine::Client::Base
4
- def all(scraper_name, opts={})
5
- params = @options.merge(opts)
6
- self.class.get("/scrapers/#{scraper_name}/jobs", params)
7
- end
8
-
9
- def create(scraper_name, opts={})
10
- body = {}
11
- body[:standard_worker_count] = opts[:workers] if opts[:workers]
12
- body[:browser_worker_count] = opts[:browsers] if opts[:browsers]
13
- body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
14
- params = @options.merge({body: body.to_json})
15
- self.class.post("/scrapers/#{scraper_name}/jobs", params)
16
- end
17
-
18
- def find(scraper_name)
19
- self.class.get("/scrapers/#{scraper_name}/current_job", @options)
20
- end
21
-
22
- def update(scraper_name, opts={})
23
- body = {}
24
- body[:status] = opts[:status] if opts[:status]
25
- body[:standard_worker_count] = opts[:workers] if opts[:workers]
26
- body[:browser_worker_count] = opts[:browsers] if opts[:browsers]
27
- body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
28
- params = @options.merge({body: body.to_json})
29
-
30
- self.class.put("/scrapers/#{scraper_name}/current_job", params)
31
- end
32
-
33
- def cancel(scraper_name, opts={})
34
- opts[:status] = 'cancelled'
35
- update(scraper_name, opts)
36
- end
37
-
38
- def resume(scraper_name, opts={})
39
- opts[:status] = 'active'
40
- update(scraper_name, opts)
41
- end
42
-
43
- def pause(scraper_name, opts={})
44
- opts[:status] = 'paused'
45
- update(scraper_name, opts)
46
- end
47
- end
48
- end
49
- end
@@ -1,19 +0,0 @@
1
- module AnswersEngine
2
- module Client
3
- class ScraperJobOutput < AnswersEngine::Client::Base
4
- def find(scraper_name, collection, id)
5
- self.class.get("/scrapers/#{scraper_name}/current_job/output/collections/#{collection}/records/#{id}", @options)
6
- end
7
-
8
- def all(scraper_name, collection = 'default')
9
-
10
- self.class.get("/scrapers/#{scraper_name}/current_job/output/collections/#{collection}/records", @options)
11
- end
12
-
13
- def collections(scraper_name)
14
- self.class.get("/scrapers/#{scraper_name}/current_job/output/collections", @options)
15
- end
16
- end
17
- end
18
- end
19
-
@@ -1,67 +0,0 @@
1
- module AnswersEngine
2
- module Client
3
- class ScraperJobPage < AnswersEngine::Client::Base
4
- def find(scraper_name, gid)
5
- self.class.get("/scrapers/#{scraper_name}/current_job/pages/#{gid}", @options)
6
- end
7
-
8
- def all(scraper_name, opts={})
9
- params = @options.merge(opts)
10
- self.class.get("/scrapers/#{scraper_name}/current_job/pages", params)
11
- end
12
-
13
- def update(scraper_name, gid, opts={})
14
- body = {}
15
- body[:page_type] = opts[:page_type] if opts[:page_type]
16
- body[:priority] = opts[:priority] if opts[:priority]
17
- body[:vars] = opts[:vars] if opts[:vars]
18
-
19
- params = @options.merge({body: body.to_json})
20
-
21
- self.class.put("/scrapers/#{scraper_name}/current_job/pages/#{gid}", params)
22
- end
23
-
24
- def refetch(scraper_name, opts={})
25
- params = @options.merge(opts)
26
- self.class.put("/scrapers/#{scraper_name}/current_job/pages/refetch", params)
27
- end
28
-
29
- def refetch_by_job(job_id, opts={})
30
- params = @options.merge(opts)
31
- self.class.put("/jobs/#{job_id}/pages/refetch", params)
32
- end
33
-
34
- def reparse(scraper_name, opts={})
35
- params = @options.merge(opts)
36
- self.class.put("/scrapers/#{scraper_name}/current_job/pages/reparse", params)
37
- end
38
-
39
- def reparse_by_job(job_id, opts={})
40
- params = @options.merge(opts)
41
- self.class.put("/jobs/#{job_id}/pages/reparse", params)
42
- end
43
-
44
- def enqueue(scraper_name, method, url, opts={})
45
- body = {}
46
- body[:method] = method != "" ? method : "GET"
47
- body[:url] = url
48
- body[:page_type] = opts[:page_type] if opts[:page_type]
49
- body[:priority] = opts[:priority] if opts[:priority]
50
- body[:fetch_type] = opts[:fetch_type] if opts[:fetch_type]
51
- body[:body] = opts[:body] if opts[:body]
52
- body[:headers] = opts[:headers] if opts[:headers]
53
- body[:vars] = opts[:vars] if opts[:vars]
54
- body[:force_fetch] = opts[:force_fetch] if opts[:force_fetch]
55
- body[:freshness] = opts[:freshness] if opts[:freshness]
56
- body[:ua_type] = opts[:ua_type] if opts[:ua_type]
57
- body[:no_redirect] = opts[:no_redirect] if opts[:no_redirect]
58
- body[:cookie] = opts[:cookie] if opts[:cookie]
59
-
60
- params = @options.merge({body: body.to_json})
61
-
62
- self.class.post("/scrapers/#{scraper_name}/current_job/pages", params)
63
- end
64
-
65
- end
66
- end
67
- end
@@ -1,28 +0,0 @@
1
- module AnswersEngine
2
- module Client
3
- class ScraperJobVar < AnswersEngine::Client::Base
4
-
5
- def find(scraper_name, var_name)
6
- self.class.get("/scrapers/#{scraper_name}/current_job/vars/#{var_name}", @options)
7
- end
8
-
9
- def all(scraper_name, opts={})
10
- params = @options.merge opts
11
- self.class.get("/scrapers/#{scraper_name}/current_job/vars", params)
12
- end
13
-
14
- def set(scraper_name, var_name, value, opts={})
15
- body = {}
16
- body[:value] = value
17
- body[:secret] = opts[:secret] if opts[:secret]
18
- params = @options.merge({body: body.to_json})
19
- self.class.put("/scrapers/#{scraper_name}/current_job/vars/#{var_name}", params)
20
- end
21
-
22
- def unset(scraper_name, var_name, opts={})
23
- params = @options.merge(opts)
24
- self.class.delete("/scrapers/#{scraper_name}/current_job/vars/#{var_name}", params)
25
- end
26
- end
27
- end
28
- end
@@ -1,28 +0,0 @@
1
- module AnswersEngine
2
- module Client
3
- class ScraperVar < AnswersEngine::Client::Base
4
-
5
- def find(scraper_name, var_name)
6
- self.class.get("/scrapers/#{scraper_name}/vars/#{var_name}", @options)
7
- end
8
-
9
- def all(scraper_name, opts={})
10
- params = @options.merge opts
11
- self.class.get("/scrapers/#{scraper_name}/vars", params)
12
- end
13
-
14
- def set(scraper_name, var_name, value, opts={})
15
- body = {}
16
- body[:value] = value
17
- body[:secret] = opts[:secret] if opts[:secret]
18
- params = @options.merge({body: body.to_json})
19
- self.class.put("/scrapers/#{scraper_name}/vars/#{var_name}", params)
20
- end
21
-
22
- def unset(scraper_name, var_name, opts={})
23
- params = @options.merge(opts)
24
- self.class.delete("/scrapers/#{scraper_name}/vars/#{var_name}", params)
25
- end
26
- end
27
- end
28
- end
@@ -1,6 +0,0 @@
1
- require 'answersengine/plugin/context_exposer'
2
-
3
- module AnswersEngine
4
- module Plugin
5
- end
6
- end
@@ -1,55 +0,0 @@
1
- module AnswersEngine
2
- module Plugin
3
- module ContextExposer
4
- def self.exposed_methods
5
- raise NotImplementedError.new('Specify methods exposed to isolated env')
6
- end
7
-
8
- def exposed_methods
9
- self.class.exposed_methods
10
- end
11
-
12
- # Create lambda to retrieve a variable or call instance method
13
- def var_or_proc vars, key
14
- myself = self # Avoid stack overflow
15
- return lambda{vars[key]} if vars.has_key?(key)
16
- lambda{|*args| myself.send(key, *args)}
17
- end
18
-
19
- def exposed_env vars
20
- keys = exposed_methods + vars.keys
21
- Hash[keys.uniq.map{|key|[key, var_or_proc(vars, key)]}]
22
- end
23
-
24
- def expose_to object, env
25
- metaclass = class << object; self; end
26
- env.each do |key, block|
27
- metaclass.send(:define_method, key, block)
28
- end
29
- object
30
- end
31
-
32
- # Create isolated context object from self
33
- def create_context vars = {}
34
- create_top_object_script = '(
35
- lambda do
36
- object = Object.new
37
- metaclass = class << object
38
- define_method(:context_binding){binding}
39
- end
40
- object
41
- end
42
- ).call'
43
- object = TOPLEVEL_BINDING.eval(create_top_object_script)
44
- env = exposed_env(vars)
45
- expose_to object, env
46
- object
47
- end
48
-
49
- # Create an isolated binding
50
- def isolated_binding vars = {}
51
- create_context(vars).context_binding
52
- end
53
- end
54
- end
55
- end