datahen 0.10.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (78) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +12 -0
  3. data/.travis.yml +7 -0
  4. data/CODE_OF_CONDUCT.md +74 -0
  5. data/Gemfile +6 -0
  6. data/LICENSE.txt +21 -0
  7. data/README.md +29 -0
  8. data/Rakefile +22 -0
  9. data/bin/console +14 -0
  10. data/bin/setup +8 -0
  11. data/datahen.gemspec +47 -0
  12. data/examples/fetchtest/libraries/hello.rb +9 -0
  13. data/examples/fetchtest/libraries/hello_fail.rb +10 -0
  14. data/examples/fetchtest/parsers/failed.rb +2 -0
  15. data/examples/fetchtest/parsers/find_outputs.rb +18 -0
  16. data/examples/fetchtest/parsers/home.rb +50 -0
  17. data/examples/fetchtest/parsers/nested_fail.rb +3 -0
  18. data/examples/fetchtest/parsers/simple.rb +14 -0
  19. data/examples/fetchtest/seeders/csv_seeder.rb +12 -0
  20. data/examples/fetchtest/seeders/failed.rb +1 -0
  21. data/examples/fetchtest/seeders/list_of_urls.csv +5 -0
  22. data/examples/fetchtest/seeders/seed.rb +28 -0
  23. data/examples/fetchtest/seeders/test_reset_page.rb +4 -0
  24. data/exe/hen +3 -0
  25. data/lib/datahen.rb +5 -0
  26. data/lib/datahen/cli.rb +45 -0
  27. data/lib/datahen/cli/env_var.rb +48 -0
  28. data/lib/datahen/cli/finisher.rb +40 -0
  29. data/lib/datahen/cli/global_page.rb +39 -0
  30. data/lib/datahen/cli/job.rb +30 -0
  31. data/lib/datahen/cli/job_output.rb +69 -0
  32. data/lib/datahen/cli/parser.rb +64 -0
  33. data/lib/datahen/cli/scraper.rb +185 -0
  34. data/lib/datahen/cli/scraper_deployment.rb +24 -0
  35. data/lib/datahen/cli/scraper_export.rb +51 -0
  36. data/lib/datahen/cli/scraper_exporter.rb +40 -0
  37. data/lib/datahen/cli/scraper_finisher.rb +20 -0
  38. data/lib/datahen/cli/scraper_job.rb +75 -0
  39. data/lib/datahen/cli/scraper_job_var.rb +48 -0
  40. data/lib/datahen/cli/scraper_page.rb +203 -0
  41. data/lib/datahen/cli/scraper_var.rb +48 -0
  42. data/lib/datahen/cli/seeder.rb +40 -0
  43. data/lib/datahen/client.rb +29 -0
  44. data/lib/datahen/client/auth_token.rb +50 -0
  45. data/lib/datahen/client/backblaze_content.rb +45 -0
  46. data/lib/datahen/client/base.rb +69 -0
  47. data/lib/datahen/client/deploy_key.rb +21 -0
  48. data/lib/datahen/client/env_var.rb +28 -0
  49. data/lib/datahen/client/export.rb +10 -0
  50. data/lib/datahen/client/global_page.rb +18 -0
  51. data/lib/datahen/client/job.rb +64 -0
  52. data/lib/datahen/client/job_export.rb +10 -0
  53. data/lib/datahen/client/job_log.rb +26 -0
  54. data/lib/datahen/client/job_output.rb +19 -0
  55. data/lib/datahen/client/job_page.rb +58 -0
  56. data/lib/datahen/client/job_stat.rb +16 -0
  57. data/lib/datahen/client/scraper.rb +57 -0
  58. data/lib/datahen/client/scraper_deployment.rb +18 -0
  59. data/lib/datahen/client/scraper_export.rb +22 -0
  60. data/lib/datahen/client/scraper_exporter.rb +14 -0
  61. data/lib/datahen/client/scraper_finisher.rb +16 -0
  62. data/lib/datahen/client/scraper_job.rb +49 -0
  63. data/lib/datahen/client/scraper_job_output.rb +19 -0
  64. data/lib/datahen/client/scraper_job_page.rb +67 -0
  65. data/lib/datahen/client/scraper_job_var.rb +28 -0
  66. data/lib/datahen/client/scraper_var.rb +28 -0
  67. data/lib/datahen/plugin.rb +6 -0
  68. data/lib/datahen/plugin/context_exposer.rb +55 -0
  69. data/lib/datahen/scraper.rb +18 -0
  70. data/lib/datahen/scraper/executor.rb +373 -0
  71. data/lib/datahen/scraper/finisher.rb +18 -0
  72. data/lib/datahen/scraper/parser.rb +18 -0
  73. data/lib/datahen/scraper/ruby_finisher_executor.rb +116 -0
  74. data/lib/datahen/scraper/ruby_parser_executor.rb +200 -0
  75. data/lib/datahen/scraper/ruby_seeder_executor.rb +120 -0
  76. data/lib/datahen/scraper/seeder.rb +18 -0
  77. data/lib/datahen/version.rb +3 -0
  78. metadata +270 -0
@@ -0,0 +1,26 @@
1
module Datahen
  module Client
    # Client for the log endpoints of jobs and of individual job pages.
    #
    # Every method merges the caller's +opts+ over the client's +@options+
    # and issues a GET through the class-level HTTP helper.
    class JobLog < Datahen::Client::Base
      # Fetch the log of one page of a job.
      #
      # @param job_id Job identifier, interpolated into the request path.
      # @param gid Page GID, interpolated into the request path.
      # @param [Hash] opts ({}) Extra request options merged over +@options+.
      #
      # @return The result of the class-level +get+ call.
      def all_job_page_log(job_id, gid, opts={})
        request_options = @options.merge(opts)
        path = "/jobs/#{job_id}/pages/#{gid}/log"
        self.class.get(path, request_options)
      end

      # Fetch the log of one page of a scraper's current job.
      #
      # @param scraper_name Scraper name, interpolated into the request path.
      # @param gid Page GID, interpolated into the request path.
      # @param [Hash] opts ({}) Extra request options merged over +@options+.
      #
      # @return The result of the class-level +get+ call.
      def scraper_all_job_page_log(scraper_name, gid, opts={})
        request_options = @options.merge(opts)
        path = "/scrapers/#{scraper_name}/current_job/pages/#{gid}/log"
        self.class.get(path, request_options)
      end

      # Fetch the log of a job.
      #
      # @param job_id Job identifier, interpolated into the request path.
      # @param [Hash] opts ({}) Extra request options merged over +@options+.
      #
      # @return The result of the class-level +get+ call.
      def all_job_log(job_id, opts={})
        request_options = @options.merge(opts)
        path = "/jobs/#{job_id}/log"
        self.class.get(path, request_options)
      end

      # Fetch the log of a scraper's current job.
      #
      # @param scraper_name Scraper name, interpolated into the request path.
      # @param [Hash] opts ({}) Extra request options merged over +@options+.
      #
      # @return The result of the class-level +get+ call.
      def scraper_all_job_log(scraper_name, opts={})
        request_options = @options.merge(opts)
        path = "/scrapers/#{scraper_name}/current_job/log"
        self.class.get(path, request_options)
      end
    end
  end
end
@@ -0,0 +1,19 @@
1
module Datahen
  module Client
    # Client for the output collections and records of a job addressed by ID.
    #
    # All requests are GETs sent with the client's +@options+ unchanged.
    class JobOutput < Datahen::Client::Base
      # Fetch a single output record.
      #
      # @param job_id Job identifier, interpolated into the request path.
      # @param collection Collection name, interpolated into the request path.
      # @param id Record identifier, interpolated into the request path.
      #
      # @return The result of the class-level +get+ call.
      def find(job_id, collection, id)
        path = "/jobs/#{job_id}/output/collections/#{collection}/records/#{id}"
        self.class.get(path, @options)
      end

      # Fetch the records of one output collection.
      #
      # @param job_id Job identifier, interpolated into the request path.
      # @param collection ('default') Collection name.
      #
      # @return The result of the class-level +get+ call.
      def all(job_id, collection = 'default')
        path = "/jobs/#{job_id}/output/collections/#{collection}/records"
        self.class.get(path, @options)
      end

      # Fetch the output collections of a job.
      #
      # @param job_id Job identifier, interpolated into the request path.
      #
      # @return The result of the class-level +get+ call.
      def collections(job_id)
        path = "/jobs/#{job_id}/output/collections"
        self.class.get(path, @options)
      end
    end
  end
end
19
+
@@ -0,0 +1,58 @@
1
module Datahen
  module Client
    # Client for the pages of a job addressed by job ID.
    #
    # Write operations build a Hash payload, encode it with +to_json+ and
    # send it as the +:body+ request option merged over +@options+.
    class JobPage < Datahen::Client::Base
      # Option keys copied into the #update payload when their value is truthy.
      UPDATE_FIELDS = %i[page_type priority vars].freeze

      # Option keys copied into the #enqueue payload when their value is
      # truthy, listed in the order they are written to the payload.
      ENQUEUE_FIELDS = %i[
        page_type priority fetch_type body headers vars
        force_fetch freshness ua_type no_redirect cookie
      ].freeze

      # Fetch a single page of a job.
      #
      # @param job_id Job identifier, interpolated into the request path.
      # @param gid Page GID, interpolated into the request path.
      #
      # @return The result of the class-level +get+ call.
      def find(job_id, gid)
        self.class.get("/jobs/#{job_id}/pages/#{gid}", @options)
      end

      # List the pages of a job.
      #
      # @param job_id Job identifier, interpolated into the request path.
      # @param [Hash] opts ({}) Extra request options merged over +@options+.
      #
      # @return The result of the class-level +get+ call.
      def all(job_id, opts={})
        params = @options.merge(opts)
        self.class.get("/jobs/#{job_id}/pages", params)
      end

      # Update a page. Only +:page_type+, +:priority+ and +:vars+ are sent,
      # and each only when its value in +opts+ is truthy.
      #
      # @param job_id Job identifier, interpolated into the request path.
      # @param gid Page GID, interpolated into the request path.
      # @param [Hash] opts ({}) Fields to update.
      #
      # @return The result of the class-level +put+ call.
      def update(job_id, gid, opts={})
        body = {}
        UPDATE_FIELDS.each { |key| body[key] = opts[key] if opts[key] }

        params = @options.merge({body: body.to_json})

        self.class.put("/jobs/#{job_id}/pages/#{gid}", params)
      end

      # Enqueue a new page on a job.
      #
      # @param job_id Job identifier, interpolated into the request path.
      # @param method HTTP method for the page. A blank value (+nil+ or
      #   +""+) falls back to "GET".
      # @param url URL of the page to enqueue.
      # @param [Hash] opts ({}) Optional page attributes; see ENQUEUE_FIELDS.
      #   Each one is sent only when its value is truthy.
      #
      # @return The result of the class-level +post+ call.
      def enqueue(job_id, method, url, opts={})
        body = {}
        # +to_s+ makes nil count as blank too; previously only "" triggered
        # the default and nil was sent to the API as a null method.
        body[:method] = method.to_s.empty? ? "GET" : method
        body[:url] = url
        ENQUEUE_FIELDS.each { |key| body[key] = opts[key] if opts[key] }

        params = @options.merge({body: body.to_json})

        self.class.post("/jobs/#{job_id}/pages", params)
      end

      # Report the result of parsing a page.
      #
      # +:outputs+ and +:pages+ default to empty arrays and +:parsing_status+
      # to +nil+ when the key is absent from +opts+; +:log_error+ is sent only
      # when truthy.
      #
      # @param job_id Job identifier, interpolated into the request path.
      # @param gid Page GID, interpolated into the request path.
      # @param [Hash] opts ({}) Parsing result fields.
      #
      # @return The result of the class-level +put+ call.
      def parsing_update(job_id, gid, opts={})
        body = {}
        body[:outputs] = opts.fetch(:outputs) { [] }
        body[:pages] = opts.fetch(:pages) { [] }
        body[:parsing_status] = opts.fetch(:parsing_status) { nil }
        body[:log_error] = opts[:log_error] if opts[:log_error]

        params = @options.merge({body: body.to_json})

        self.class.put("/jobs/#{job_id}/pages/#{gid}/parsing_update", params)
      end
    end
  end
end
@@ -0,0 +1,16 @@
1
module Datahen
  module Client
    # Client for the current statistics of a job.
    #
    # Both requests are GETs sent with the client's +@options+ unchanged.
    class JobStat < Datahen::Client::Base
      # Fetch the current stats of a job addressed by ID.
      #
      # @param job_id Job identifier, interpolated into the request path.
      #
      # @return The result of the class-level +get+ call.
      def job_current_stats(job_id)
        path = "/jobs/#{job_id}/stats/current"
        self.class.get(path, @options)
      end

      # Fetch the current stats of a scraper's current job.
      #
      # @param scraper_name Scraper name, interpolated into the request path.
      #
      # @return The result of the class-level +get+ call.
      def scraper_job_current_stats(scraper_name)
        path = "/scrapers/#{scraper_name}/current_job/stats/current"
        self.class.get(path, @options)
      end
    end
  end
end
16
+
@@ -0,0 +1,57 @@
1
module Datahen
  module Client
    # Client for creating, reading, updating and deleting scrapers.
    #
    # Write operations build a Hash payload, encode it with +to_json+ and
    # send it as the +:body+ request option merged over +@options+.
    class Scraper < Datahen::Client::Base
      # Fetch a scraper by name.
      #
      # @param scraper_name Scraper name, interpolated into the request path.
      #
      # @return The result of the class-level +get+ call.
      def find(scraper_name)
        self.class.get("/scrapers/#{scraper_name}", @options)
      end

      # List scrapers.
      #
      # @param [Hash] opts ({}) Extra request options merged over +@options+.
      #
      # @return The result of the class-level +get+ call.
      def all(opts={})
        params = @options.merge opts
        self.class.get("/scrapers", params)
      end

      # Create a scraper.
      #
      # Optional attributes are sent only when their value in +opts+ is
      # truthy. Several accept a short alias that takes precedence:
      # +:branch+ / +:git_branch+, +:workers+ / +:standard_worker_count+,
      # +:browsers+ / +:browser_worker_count+.
      #
      # @param scraper_name Name of the new scraper.
      # @param git_repository Git repository of the new scraper.
      # @param [Hash] opts ({}) Optional scraper attributes.
      #
      # @return The result of the class-level +post+ call.
      def create(scraper_name, git_repository, opts={})
        body = {}
        body[:name] = scraper_name
        body[:git_repository] = git_repository
        # No client-side default branch is applied. The former `|| "master"`
        # fallback sat behind a guard requiring a branch, so it never fired.
        branch = opts[:branch] || opts[:git_branch]
        body[:git_branch] = branch if branch
        body[:freshness_type] = opts[:freshness_type] if opts[:freshness_type]
        body[:force_fetch] = opts[:force_fetch] if opts[:force_fetch]
        workers = opts[:workers] || opts[:standard_worker_count]
        body[:standard_worker_count] = workers if workers
        browsers = opts[:browsers] || opts[:browser_worker_count]
        body[:browser_worker_count] = browsers if browsers
        body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
        body[:disable_scheduler] = opts[:disable_scheduler] if opts[:disable_scheduler]
        body[:cancel_current_job] = opts[:cancel_current_job] if opts[:cancel_current_job]
        body[:schedule] = opts[:schedule] if opts[:schedule]
        body[:timezone] = opts[:timezone] if opts[:timezone]
        params = @options.merge({body: body.to_json})
        self.class.post("/scrapers", params)
      end

      # Update a scraper.
      #
      # Most attributes are sent only when truthy. The boolean flags
      # +:force_fetch+, +:disable_scheduler+ and +:cancel_current_job+ are
      # sent whenever the key is present (as a Symbol or a String), so that
      # an explicit +false+ reaches the API.
      #
      # @param scraper_name Scraper name, interpolated into the request path.
      # @param [Hash] opts ({}) Attributes to change.
      #
      # @return The result of the class-level +put+ call.
      def update(scraper_name, opts={})
        # Reads a flag under whichever key form is present. Previously the
        # presence check accepted String keys but the value was always read
        # with the Symbol key, yielding nil for String-keyed plain Hashes.
        flag_given = ->(key) { opts.has_key?(key.to_s) || opts.has_key?(key) }
        flag_value = ->(key) { opts.has_key?(key) ? opts[key] : opts[key.to_s] }

        body = {}
        body[:name] = opts[:name] if opts[:name]
        repository = opts[:repo] || opts[:git_repository]
        body[:git_repository] = repository if repository
        branch = opts[:branch] || opts[:git_branch]
        body[:git_branch] = branch if branch
        body[:freshness_type] = opts[:freshness_type] if opts[:freshness_type]
        body[:force_fetch] = flag_value.call(:force_fetch) if flag_given.call(:force_fetch)
        workers = opts[:workers] || opts[:standard_worker_count]
        body[:standard_worker_count] = workers if workers
        browsers = opts[:browsers] || opts[:browser_worker_count]
        body[:browser_worker_count] = browsers if browsers
        body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
        body[:disable_scheduler] = flag_value.call(:disable_scheduler) if flag_given.call(:disable_scheduler)
        body[:cancel_current_job] = flag_value.call(:cancel_current_job) if flag_given.call(:cancel_current_job)
        body[:schedule] = opts[:schedule] if opts[:schedule]
        body[:timezone] = opts[:timezone] if opts[:timezone]
        params = @options.merge({body: body.to_json})

        self.class.put("/scrapers/#{scraper_name}", params)
      end

      # Delete a scraper.
      #
      # @param scraper_name Scraper name, interpolated into the request path.
      # @param [Hash] opts ({}) Extra request options merged over +@options+.
      #
      # @return The result of the class-level +delete+ call.
      def delete(scraper_name, opts={})
        params = @options.merge(opts)
        self.class.delete("/scrapers/#{scraper_name}", params)
      end
    end
  end
end
@@ -0,0 +1,18 @@
1
module Datahen
  module Client
    # Client for the deployments of a scraper.
    class ScraperDeployment < Datahen::Client::Base
      # List the deployments of a scraper.
      #
      # @param scraper_name Scraper name, interpolated into the request path.
      # @param [Hash] opts ({}) Extra request options merged over +@options+.
      #
      # @return The result of the class-level +get+ call.
      def all(scraper_name, opts={})
        request_options = @options.merge(opts)
        path = "/scrapers/#{scraper_name}/deployments"
        self.class.get(path, request_options)
      end

      # Create a deployment for a scraper (POST to the deployments path).
      #
      # @param scraper_name Scraper name, interpolated into the request path.
      # @param [Hash] opts ({}) Extra request options merged over +@options+.
      #
      # @return The result of the class-level +post+ call.
      def deploy(scraper_name, opts={})
        request_options = @options.merge(opts)
        path = "/scrapers/#{scraper_name}/deployments"
        self.class.post(path, request_options)
      end
    end
  end
end
@@ -0,0 +1,22 @@
1
module Datahen
  module Client
    # Client for the exports of a scraper.
    class ScraperExport < Datahen::Client::Base
      # List the exports of a scraper.
      #
      # @param scraper_name Scraper name, interpolated into the request path.
      # @param [Hash] opts ({}) Extra request options merged over +@options+.
      #
      # @return The result of the class-level +get+ call.
      def all(scraper_name, opts={})
        request_options = @options.merge(opts)
        path = "/scrapers/#{scraper_name}/exports"
        self.class.get(path, request_options)
      end

      # Fetch an export by its ID.
      #
      # @param export_id Export identifier, interpolated into the request path.
      #
      # @return The result of the class-level +get+ call.
      def find(export_id)
        path = "/scrapers/exports/#{export_id}"
        self.class.get(path, @options)
      end

      # Create an export of a scraper using the named exporter.
      #
      # @param scraper_name Scraper name, interpolated into the request path.
      # @param exporter_name Exporter name, interpolated into the request path.
      #
      # @return The result of the class-level +post+ call.
      def create(scraper_name, exporter_name)
        path = "/scrapers/#{scraper_name}/exports/#{exporter_name}"
        self.class.post(path, @options)
      end

      # Request the download endpoint of an export.
      #
      # @param export_id Export identifier, interpolated into the request path.
      #
      # @return The result of the class-level +get+ call.
      def download(export_id)
        path = "/scrapers/exports/#{export_id}/download"
        self.class.get(path, @options)
      end
    end
  end
end
@@ -0,0 +1,14 @@
1
module Datahen
  module Client
    # Client for the exporters configured on a scraper.
    class ScraperExporter < Datahen::Client::Base
      # List the exporters of a scraper.
      #
      # @param scraper_name Scraper name, interpolated into the request path.
      # @param [Hash] opts ({}) Extra request options merged over +@options+.
      #
      # @return The result of the class-level +get+ call.
      def all(scraper_name, opts={})
        request_options = @options.merge(opts)
        path = "/scrapers/#{scraper_name}/exporters"
        self.class.get(path, request_options)
      end

      # Fetch one exporter of a scraper by name.
      #
      # @param scraper_name Scraper name, interpolated into the request path.
      # @param exporter_name Exporter name, interpolated into the request path.
      #
      # @return The result of the class-level +get+ call.
      def find(scraper_name, exporter_name)
        path = "/scrapers/#{scraper_name}/exporters/#{exporter_name}"
        self.class.get(path, @options)
      end
    end
  end
end
@@ -0,0 +1,16 @@
1
module Datahen
  module Client
    # Client for the finisher of a scraper's current job.
    class ScraperFinisher < Datahen::Client::Base
      # Reset the finisher on a scraper's current job.
      #
      # Sends a PUT to the finisher reset path with +opts+ merged over the
      # client's +@options+.
      #
      # @param [String] scraper_name Scraper name.
      # @param [Hash] opts ({}) API custom parameters.
      #
      # @return [HTTParty::Response]
      def reset(scraper_name, opts={})
        request_options = @options.merge(opts)
        path = "/scrapers/#{scraper_name}/current_job/finisher/reset"
        self.class.put(path, request_options)
      end
    end
  end
end
@@ -0,0 +1,49 @@
1
module Datahen
  module Client
    # Client for the jobs of a scraper, including its current job.
    #
    # Write operations build a Hash payload, encode it with +to_json+ and
    # send it as the +:body+ request option merged over +@options+.
    class ScraperJob < Datahen::Client::Base
      # List the jobs of a scraper.
      #
      # @param scraper_name Scraper name, interpolated into the request path.
      # @param [Hash] opts ({}) Extra request options merged over +@options+.
      #
      # @return The result of the class-level +get+ call.
      def all(scraper_name, opts={})
        params = @options.merge(opts)
        self.class.get("/scrapers/#{scraper_name}/jobs", params)
      end

      # Create a job on a scraper. +:workers+, +:browsers+ and +:proxy_type+
      # are sent only when truthy.
      #
      # @param scraper_name Scraper name, interpolated into the request path.
      # @param [Hash] opts ({}) Optional job attributes.
      #
      # @return The result of the class-level +post+ call.
      def create(scraper_name, opts={})
        body = {}
        body[:standard_worker_count] = opts[:workers] if opts[:workers]
        body[:browser_worker_count] = opts[:browsers] if opts[:browsers]
        body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
        params = @options.merge({body: body.to_json})
        self.class.post("/scrapers/#{scraper_name}/jobs", params)
      end

      # Fetch the current job of a scraper.
      #
      # @param scraper_name Scraper name, interpolated into the request path.
      #
      # @return The result of the class-level +get+ call.
      def find(scraper_name)
        self.class.get("/scrapers/#{scraper_name}/current_job", @options)
      end

      # Update the current job of a scraper. +:status+, +:workers+,
      # +:browsers+ and +:proxy_type+ are sent only when truthy.
      #
      # @param scraper_name Scraper name, interpolated into the request path.
      # @param [Hash] opts ({}) Attributes to change.
      #
      # @return The result of the class-level +put+ call.
      def update(scraper_name, opts={})
        body = {}
        body[:status] = opts[:status] if opts[:status]
        body[:standard_worker_count] = opts[:workers] if opts[:workers]
        body[:browser_worker_count] = opts[:browsers] if opts[:browsers]
        body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
        params = @options.merge({body: body.to_json})

        self.class.put("/scrapers/#{scraper_name}/current_job", params)
      end

      # Set the current job's status to 'cancelled'.
      #
      # The caller's +opts+ is left untouched: the status is added to a
      # merged copy, so frozen hashes are accepted and a reused hash does not
      # carry a stale +:status+ into later calls.
      #
      # @param scraper_name Scraper name, interpolated into the request path.
      # @param [Hash] opts ({}) Additional attributes passed to #update.
      #
      # @return The result of #update.
      def cancel(scraper_name, opts={})
        update(scraper_name, opts.merge(status: 'cancelled'))
      end

      # Set the current job's status to 'active'.
      #
      # Like #cancel, works on a merged copy and never mutates +opts+.
      #
      # @param scraper_name Scraper name, interpolated into the request path.
      # @param [Hash] opts ({}) Additional attributes passed to #update.
      #
      # @return The result of #update.
      def resume(scraper_name, opts={})
        update(scraper_name, opts.merge(status: 'active'))
      end

      # Set the current job's status to 'paused'.
      #
      # Like #cancel, works on a merged copy and never mutates +opts+.
      #
      # @param scraper_name Scraper name, interpolated into the request path.
      # @param [Hash] opts ({}) Additional attributes passed to #update.
      #
      # @return The result of #update.
      def pause(scraper_name, opts={})
        update(scraper_name, opts.merge(status: 'paused'))
      end
    end
  end
end
@@ -0,0 +1,19 @@
1
module Datahen
  module Client
    # Client for the output collections and records of a scraper's current job.
    #
    # All requests are GETs sent with the client's +@options+ unchanged.
    class ScraperJobOutput < Datahen::Client::Base
      # Fetch a single output record.
      #
      # @param scraper_name Scraper name, interpolated into the request path.
      # @param collection Collection name, interpolated into the request path.
      # @param id Record identifier, interpolated into the request path.
      #
      # @return The result of the class-level +get+ call.
      def find(scraper_name, collection, id)
        path = "/scrapers/#{scraper_name}/current_job/output/collections/#{collection}/records/#{id}"
        self.class.get(path, @options)
      end

      # Fetch the records of one output collection.
      #
      # @param scraper_name Scraper name, interpolated into the request path.
      # @param collection ('default') Collection name.
      #
      # @return The result of the class-level +get+ call.
      def all(scraper_name, collection = 'default')
        path = "/scrapers/#{scraper_name}/current_job/output/collections/#{collection}/records"
        self.class.get(path, @options)
      end

      # Fetch the output collections of the current job.
      #
      # @param scraper_name Scraper name, interpolated into the request path.
      #
      # @return The result of the class-level +get+ call.
      def collections(scraper_name)
        path = "/scrapers/#{scraper_name}/current_job/output/collections"
        self.class.get(path, @options)
      end
    end
  end
end
19
+
@@ -0,0 +1,67 @@
1
module Datahen
  module Client
    # Client for the pages of a scraper's current job, plus refetch/reparse
    # operations that can also target a job by ID.
    #
    # Write operations that carry a payload build a Hash, encode it with
    # +to_json+ and send it as the +:body+ request option merged over
    # +@options+.
    class ScraperJobPage < Datahen::Client::Base
      # Option keys copied into the #update payload when their value is truthy.
      UPDATE_FIELDS = %i[page_type priority vars].freeze

      # Option keys copied into the #enqueue payload when their value is
      # truthy, listed in the order they are written to the payload.
      ENQUEUE_FIELDS = %i[
        page_type priority fetch_type body headers vars
        force_fetch freshness ua_type no_redirect cookie
      ].freeze

      # Fetch a single page of the current job.
      #
      # @param scraper_name Scraper name, interpolated into the request path.
      # @param gid Page GID, interpolated into the request path.
      #
      # @return The result of the class-level +get+ call.
      def find(scraper_name, gid)
        self.class.get("/scrapers/#{scraper_name}/current_job/pages/#{gid}", @options)
      end

      # List the pages of the current job.
      #
      # @param scraper_name Scraper name, interpolated into the request path.
      # @param [Hash] opts ({}) Extra request options merged over +@options+.
      #
      # @return The result of the class-level +get+ call.
      def all(scraper_name, opts={})
        params = @options.merge(opts)
        self.class.get("/scrapers/#{scraper_name}/current_job/pages", params)
      end

      # Update a page. Only +:page_type+, +:priority+ and +:vars+ are sent,
      # and each only when its value in +opts+ is truthy.
      #
      # @param scraper_name Scraper name, interpolated into the request path.
      # @param gid Page GID, interpolated into the request path.
      # @param [Hash] opts ({}) Fields to update.
      #
      # @return The result of the class-level +put+ call.
      def update(scraper_name, gid, opts={})
        body = {}
        UPDATE_FIELDS.each { |key| body[key] = opts[key] if opts[key] }

        params = @options.merge({body: body.to_json})

        self.class.put("/scrapers/#{scraper_name}/current_job/pages/#{gid}", params)
      end

      # Request a refetch of pages on the scraper's current job.
      #
      # @param scraper_name Scraper name, interpolated into the request path.
      # @param [Hash] opts ({}) Extra request options merged over +@options+.
      #
      # @return The result of the class-level +put+ call.
      def refetch(scraper_name, opts={})
        params = @options.merge(opts)
        self.class.put("/scrapers/#{scraper_name}/current_job/pages/refetch", params)
      end

      # Request a refetch of pages on a job addressed by ID.
      #
      # @param job_id Job identifier, interpolated into the request path.
      # @param [Hash] opts ({}) Extra request options merged over +@options+.
      #
      # @return The result of the class-level +put+ call.
      def refetch_by_job(job_id, opts={})
        params = @options.merge(opts)
        self.class.put("/jobs/#{job_id}/pages/refetch", params)
      end

      # Request a reparse of pages on the scraper's current job.
      #
      # @param scraper_name Scraper name, interpolated into the request path.
      # @param [Hash] opts ({}) Extra request options merged over +@options+.
      #
      # @return The result of the class-level +put+ call.
      def reparse(scraper_name, opts={})
        params = @options.merge(opts)
        self.class.put("/scrapers/#{scraper_name}/current_job/pages/reparse", params)
      end

      # Request a reparse of pages on a job addressed by ID.
      #
      # @param job_id Job identifier, interpolated into the request path.
      # @param [Hash] opts ({}) Extra request options merged over +@options+.
      #
      # @return The result of the class-level +put+ call.
      def reparse_by_job(job_id, opts={})
        params = @options.merge(opts)
        self.class.put("/jobs/#{job_id}/pages/reparse", params)
      end

      # Enqueue a new page on the scraper's current job.
      #
      # @param scraper_name Scraper name, interpolated into the request path.
      # @param method HTTP method for the page. A blank value (+nil+ or
      #   +""+) falls back to "GET".
      # @param url URL of the page to enqueue.
      # @param [Hash] opts ({}) Optional page attributes; see ENQUEUE_FIELDS.
      #   Each one is sent only when its value is truthy.
      #
      # @return The result of the class-level +post+ call.
      def enqueue(scraper_name, method, url, opts={})
        body = {}
        # +to_s+ makes nil count as blank too; previously only "" triggered
        # the default and nil was sent to the API as a null method.
        body[:method] = method.to_s.empty? ? "GET" : method
        body[:url] = url
        ENQUEUE_FIELDS.each { |key| body[key] = opts[key] if opts[key] }

        params = @options.merge({body: body.to_json})

        self.class.post("/scrapers/#{scraper_name}/current_job/pages", params)
      end
    end
  end
end
@@ -0,0 +1,28 @@
1
module Datahen
  module Client
    # Client for the variables of a scraper's current job.
    class ScraperJobVar < Datahen::Client::Base
      # Fetch one variable of the current job.
      #
      # @param scraper_name Scraper name, interpolated into the request path.
      # @param var_name Variable name, interpolated into the request path.
      #
      # @return The result of the class-level +get+ call.
      def find(scraper_name, var_name)
        path = "/scrapers/#{scraper_name}/current_job/vars/#{var_name}"
        self.class.get(path, @options)
      end

      # List the variables of the current job.
      #
      # @param scraper_name Scraper name, interpolated into the request path.
      # @param [Hash] opts ({}) Extra request options merged over +@options+.
      #
      # @return The result of the class-level +get+ call.
      def all(scraper_name, opts={})
        request_options = @options.merge(opts)
        path = "/scrapers/#{scraper_name}/current_job/vars"
        self.class.get(path, request_options)
      end

      # Set a variable on the current job.
      #
      # The JSON payload always carries +value+; +secret+ is included only
      # when +opts[:secret]+ is truthy.
      #
      # @param scraper_name Scraper name, interpolated into the request path.
      # @param var_name Variable name, interpolated into the request path.
      # @param value Value to store.
      # @param [Hash] opts ({}) Only +:secret+ is read.
      #
      # @return The result of the class-level +put+ call.
      def set(scraper_name, var_name, value, opts={})
        payload = { value: value }
        payload[:secret] = opts[:secret] if opts[:secret]
        request_options = @options.merge({body: payload.to_json})
        path = "/scrapers/#{scraper_name}/current_job/vars/#{var_name}"
        self.class.put(path, request_options)
      end

      # Remove a variable from the current job.
      #
      # @param scraper_name Scraper name, interpolated into the request path.
      # @param var_name Variable name, interpolated into the request path.
      # @param [Hash] opts ({}) Extra request options merged over +@options+.
      #
      # @return The result of the class-level +delete+ call.
      def unset(scraper_name, var_name, opts={})
        request_options = @options.merge(opts)
        path = "/scrapers/#{scraper_name}/current_job/vars/#{var_name}"
        self.class.delete(path, request_options)
      end
    end
  end
end
@@ -0,0 +1,28 @@
1
module Datahen
  module Client
    # Client for the variables of a scraper.
    class ScraperVar < Datahen::Client::Base
      # Fetch one variable of a scraper.
      #
      # @param scraper_name Scraper name, interpolated into the request path.
      # @param var_name Variable name, interpolated into the request path.
      #
      # @return The result of the class-level +get+ call.
      def find(scraper_name, var_name)
        path = "/scrapers/#{scraper_name}/vars/#{var_name}"
        self.class.get(path, @options)
      end

      # List the variables of a scraper.
      #
      # @param scraper_name Scraper name, interpolated into the request path.
      # @param [Hash] opts ({}) Extra request options merged over +@options+.
      #
      # @return The result of the class-level +get+ call.
      def all(scraper_name, opts={})
        request_options = @options.merge(opts)
        path = "/scrapers/#{scraper_name}/vars"
        self.class.get(path, request_options)
      end

      # Set a variable on a scraper.
      #
      # The JSON payload always carries +value+; +secret+ is included only
      # when +opts[:secret]+ is truthy.
      #
      # @param scraper_name Scraper name, interpolated into the request path.
      # @param var_name Variable name, interpolated into the request path.
      # @param value Value to store.
      # @param [Hash] opts ({}) Only +:secret+ is read.
      #
      # @return The result of the class-level +put+ call.
      def set(scraper_name, var_name, value, opts={})
        payload = { value: value }
        payload[:secret] = opts[:secret] if opts[:secret]
        request_options = @options.merge({body: payload.to_json})
        path = "/scrapers/#{scraper_name}/vars/#{var_name}"
        self.class.put(path, request_options)
      end

      # Remove a variable from a scraper.
      #
      # @param scraper_name Scraper name, interpolated into the request path.
      # @param var_name Variable name, interpolated into the request path.
      # @param [Hash] opts ({}) Extra request options merged over +@options+.
      #
      # @return The result of the class-level +delete+ call.
      def unset(scraper_name, var_name, opts={})
        request_options = @options.merge(opts)
        path = "/scrapers/#{scraper_name}/vars/#{var_name}"
        self.class.delete(path, request_options)
      end
    end
  end
end