datahen 0.10.4

This diff shows the content of publicly released package versions as published to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (78)
  1. checksums.yaml +7 -0
  2. data/.gitignore +12 -0
  3. data/.travis.yml +7 -0
  4. data/CODE_OF_CONDUCT.md +74 -0
  5. data/Gemfile +6 -0
  6. data/LICENSE.txt +21 -0
  7. data/README.md +29 -0
  8. data/Rakefile +22 -0
  9. data/bin/console +14 -0
  10. data/bin/setup +8 -0
  11. data/datahen.gemspec +47 -0
  12. data/examples/fetchtest/libraries/hello.rb +9 -0
  13. data/examples/fetchtest/libraries/hello_fail.rb +10 -0
  14. data/examples/fetchtest/parsers/failed.rb +2 -0
  15. data/examples/fetchtest/parsers/find_outputs.rb +18 -0
  16. data/examples/fetchtest/parsers/home.rb +50 -0
  17. data/examples/fetchtest/parsers/nested_fail.rb +3 -0
  18. data/examples/fetchtest/parsers/simple.rb +14 -0
  19. data/examples/fetchtest/seeders/csv_seeder.rb +12 -0
  20. data/examples/fetchtest/seeders/failed.rb +1 -0
  21. data/examples/fetchtest/seeders/list_of_urls.csv +5 -0
  22. data/examples/fetchtest/seeders/seed.rb +28 -0
  23. data/examples/fetchtest/seeders/test_reset_page.rb +4 -0
  24. data/exe/hen +3 -0
  25. data/lib/datahen.rb +5 -0
  26. data/lib/datahen/cli.rb +45 -0
  27. data/lib/datahen/cli/env_var.rb +48 -0
  28. data/lib/datahen/cli/finisher.rb +40 -0
  29. data/lib/datahen/cli/global_page.rb +39 -0
  30. data/lib/datahen/cli/job.rb +30 -0
  31. data/lib/datahen/cli/job_output.rb +69 -0
  32. data/lib/datahen/cli/parser.rb +64 -0
  33. data/lib/datahen/cli/scraper.rb +185 -0
  34. data/lib/datahen/cli/scraper_deployment.rb +24 -0
  35. data/lib/datahen/cli/scraper_export.rb +51 -0
  36. data/lib/datahen/cli/scraper_exporter.rb +40 -0
  37. data/lib/datahen/cli/scraper_finisher.rb +20 -0
  38. data/lib/datahen/cli/scraper_job.rb +75 -0
  39. data/lib/datahen/cli/scraper_job_var.rb +48 -0
  40. data/lib/datahen/cli/scraper_page.rb +203 -0
  41. data/lib/datahen/cli/scraper_var.rb +48 -0
  42. data/lib/datahen/cli/seeder.rb +40 -0
  43. data/lib/datahen/client.rb +29 -0
  44. data/lib/datahen/client/auth_token.rb +50 -0
  45. data/lib/datahen/client/backblaze_content.rb +45 -0
  46. data/lib/datahen/client/base.rb +69 -0
  47. data/lib/datahen/client/deploy_key.rb +21 -0
  48. data/lib/datahen/client/env_var.rb +28 -0
  49. data/lib/datahen/client/export.rb +10 -0
  50. data/lib/datahen/client/global_page.rb +18 -0
  51. data/lib/datahen/client/job.rb +64 -0
  52. data/lib/datahen/client/job_export.rb +10 -0
  53. data/lib/datahen/client/job_log.rb +26 -0
  54. data/lib/datahen/client/job_output.rb +19 -0
  55. data/lib/datahen/client/job_page.rb +58 -0
  56. data/lib/datahen/client/job_stat.rb +16 -0
  57. data/lib/datahen/client/scraper.rb +57 -0
  58. data/lib/datahen/client/scraper_deployment.rb +18 -0
  59. data/lib/datahen/client/scraper_export.rb +22 -0
  60. data/lib/datahen/client/scraper_exporter.rb +14 -0
  61. data/lib/datahen/client/scraper_finisher.rb +16 -0
  62. data/lib/datahen/client/scraper_job.rb +49 -0
  63. data/lib/datahen/client/scraper_job_output.rb +19 -0
  64. data/lib/datahen/client/scraper_job_page.rb +67 -0
  65. data/lib/datahen/client/scraper_job_var.rb +28 -0
  66. data/lib/datahen/client/scraper_var.rb +28 -0
  67. data/lib/datahen/plugin.rb +6 -0
  68. data/lib/datahen/plugin/context_exposer.rb +55 -0
  69. data/lib/datahen/scraper.rb +18 -0
  70. data/lib/datahen/scraper/executor.rb +373 -0
  71. data/lib/datahen/scraper/finisher.rb +18 -0
  72. data/lib/datahen/scraper/parser.rb +18 -0
  73. data/lib/datahen/scraper/ruby_finisher_executor.rb +116 -0
  74. data/lib/datahen/scraper/ruby_parser_executor.rb +200 -0
  75. data/lib/datahen/scraper/ruby_seeder_executor.rb +120 -0
  76. data/lib/datahen/scraper/seeder.rb +18 -0
  77. data/lib/datahen/version.rb +3 -0
  78. metadata +270 -0
data/lib/datahen/client/job_log.rb
@@ -0,0 +1,26 @@
+ module Datahen
+   module Client
+     class JobLog < Datahen::Client::Base
+       def all_job_page_log(job_id, gid, opts={})
+         params = @options.merge(opts)
+         self.class.get("/jobs/#{job_id}/pages/#{gid}/log", params)
+       end
+
+       def scraper_all_job_page_log(scraper_name, gid, opts={})
+         params = @options.merge(opts)
+         self.class.get("/scrapers/#{scraper_name}/current_job/pages/#{gid}/log", params)
+       end
+
+       def all_job_log(job_id, opts={})
+         params = @options.merge(opts)
+         self.class.get("/jobs/#{job_id}/log", params)
+       end
+
+       def scraper_all_job_log(scraper_name, opts={})
+         params = @options.merge(opts)
+         self.class.get("/scrapers/#{scraper_name}/current_job/log", params)
+       end
+
+     end
+   end
+ end
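For orientation, a minimal usage sketch of the JobLog client above (not part of the gem's own examples). It assumes the client can be constructed with Datahen::Client::JobLog.new once API credentials are configured; the job ID, scraper name, and page GID below are placeholders.

require 'datahen'

# Hypothetical identifiers; substitute a real job ID, scraper name, and page GID.
logs = Datahen::Client::JobLog.new

puts logs.all_job_log(12345)                                  # full log of a job
puts logs.all_job_page_log(12345, 'www.example.com-abc123')   # log of one page in that job
puts logs.scraper_all_job_log('my-scraper')                   # log of a scraper's current job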
data/lib/datahen/client/job_output.rb
@@ -0,0 +1,19 @@
+ module Datahen
+   module Client
+     class JobOutput < Datahen::Client::Base
+       def find(job_id, collection, id)
+         self.class.get("/jobs/#{job_id}/output/collections/#{collection}/records/#{id}", @options)
+       end
+
+       def all(job_id, collection = 'default')
+
+         self.class.get("/jobs/#{job_id}/output/collections/#{collection}/records", @options)
+       end
+
+       def collections(job_id)
+         self.class.get("/jobs/#{job_id}/output/collections", @options)
+       end
+     end
+   end
+ end
+
data/lib/datahen/client/job_page.rb
@@ -0,0 +1,58 @@
+ module Datahen
+   module Client
+     class JobPage < Datahen::Client::Base
+       def find(job_id, gid)
+         self.class.get("/jobs/#{job_id}/pages/#{gid}", @options)
+       end
+
+       def all(job_id, opts={})
+         params = @options.merge(opts)
+         self.class.get("/jobs/#{job_id}/pages", params)
+       end
+
+       def update(job_id, gid, opts={})
+         body = {}
+         body[:page_type] = opts[:page_type] if opts[:page_type]
+         body[:priority] = opts[:priority] if opts[:priority]
+         body[:vars] = opts[:vars] if opts[:vars]
+
+         params = @options.merge({body: body.to_json})
+
+         self.class.put("/jobs/#{job_id}/pages/#{gid}", params)
+       end
+
+       def enqueue(job_id, method, url, opts={})
+         body = {}
+         body[:method] = method != "" ? method : "GET"
+         body[:url] = url
+         body[:page_type] = opts[:page_type] if opts[:page_type]
+         body[:priority] = opts[:priority] if opts[:priority]
+         body[:fetch_type] = opts[:fetch_type] if opts[:fetch_type]
+         body[:body] = opts[:body] if opts[:body]
+         body[:headers] = opts[:headers] if opts[:headers]
+         body[:vars] = opts[:vars] if opts[:vars]
+         body[:force_fetch] = opts[:force_fetch] if opts[:force_fetch]
+         body[:freshness] = opts[:freshness] if opts[:freshness]
+         body[:ua_type] = opts[:ua_type] if opts[:ua_type]
+         body[:no_redirect] = opts[:no_redirect] if opts[:no_redirect]
+         body[:cookie] = opts[:cookie] if opts[:cookie]
+
+         params = @options.merge({body: body.to_json})
+
+         self.class.post("/jobs/#{job_id}/pages", params)
+       end
+
+       def parsing_update(job_id, gid, opts={})
+         body = {}
+         body[:outputs] = opts.fetch(:outputs) {[]}
+         body[:pages] = opts.fetch(:pages) {[]}
+         body[:parsing_status] = opts.fetch(:parsing_status){ nil }
+         body[:log_error] = opts[:log_error] if opts[:log_error]
+
+         params = @options.merge({body: body.to_json})
+
+         self.class.put("/jobs/#{job_id}/pages/#{gid}/parsing_update", params)
+       end
+     end
+   end
+ end
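A short sketch of how the enqueue and update calls above might be used. It is illustrative only: the job ID, GID, URL, and vars are placeholders, and client construction is assumed to follow the gem's default credential handling.

require 'datahen'

pages = Datahen::Client::JobPage.new

# Queue a fetch; an empty method string falls back to "GET" (see enqueue above).
pages.enqueue(12345, '', 'https://example.com/products',
              page_type: 'listings',
              vars: { category: 'books' })

# Raise the priority of an already queued page, addressed by its GID.
pages.update(12345, 'www.example.com-abc123', priority: 10)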
data/lib/datahen/client/job_stat.rb
@@ -0,0 +1,16 @@
+ module Datahen
+   module Client
+     class JobStat < Datahen::Client::Base
+
+       def job_current_stats(job_id)
+         self.class.get("/jobs/#{job_id}/stats/current", @options)
+       end
+
+       def scraper_job_current_stats(scraper_name)
+         self.class.get("/scrapers/#{scraper_name}/current_job/stats/current", @options)
+       end
+
+     end
+   end
+ end
+
data/lib/datahen/client/scraper.rb
@@ -0,0 +1,57 @@
+ module Datahen
+   module Client
+     class Scraper < Datahen::Client::Base
+
+       def find(scraper_name)
+         self.class.get("/scrapers/#{scraper_name}", @options)
+       end
+
+       def all(opts={})
+         params = @options.merge opts
+         self.class.get("/scrapers", params)
+       end
+
+       def create(scraper_name, git_repository, opts={})
+         body = {}
+         body[:name] = scraper_name
+         body[:git_repository] = git_repository
+         body[:git_branch] = opts[:branch] || opts[:git_branch] || "master" if opts[:branch] || opts[:git_branch]
+         body[:freshness_type] = opts[:freshness_type] if opts[:freshness_type]
+         body[:force_fetch] = opts[:force_fetch] if opts[:force_fetch]
+         body[:standard_worker_count] = opts[:workers] || opts[:standard_worker_count] if opts[:workers] || opts[:standard_worker_count]
+         body[:browser_worker_count] = opts[:browsers] || opts[:browser_worker_count] if opts[:browsers] || opts[:browser_worker_count]
+         body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
+         body[:disable_scheduler] = opts[:disable_scheduler] if opts[:disable_scheduler]
+         body[:cancel_current_job] = opts[:cancel_current_job] if opts[:cancel_current_job]
+         body[:schedule] = opts[:schedule] if opts[:schedule]
+         body[:timezone] = opts[:timezone] if opts[:timezone]
+         params = @options.merge({body: body.to_json})
+         self.class.post("/scrapers", params)
+       end
+
+       def update(scraper_name, opts={})
+         body = {}
+         body[:name] = opts[:name] if opts[:name]
+         body[:git_repository] = opts[:repo] || opts[:git_repository] if opts[:repo] || opts[:git_repository]
+         body[:git_branch] = opts[:branch] || opts[:git_branch] if opts[:branch] || opts[:git_branch]
+         body[:freshness_type] = opts[:freshness_type] if opts[:freshness_type]
+         body[:force_fetch] = opts[:force_fetch] if opts.has_key?("force_fetch") || opts.has_key?(:force_fetch)
+         body[:standard_worker_count] = opts[:workers] || opts[:standard_worker_count] if opts[:workers] || opts[:standard_worker_count]
+         body[:browser_worker_count] = opts[:browsers] || opts[:browser_worker_count] if opts[:browsers] || opts[:browser_worker_count]
+         body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
+         body[:disable_scheduler] = opts[:disable_scheduler] if opts.has_key?("disable_scheduler") || opts.has_key?(:disable_scheduler)
+         body[:cancel_current_job] = opts[:cancel_current_job] if opts.has_key?("cancel_current_job") || opts.has_key?(:cancel_current_job)
+         body[:schedule] = opts[:schedule] if opts[:schedule]
+         body[:timezone] = opts[:timezone] if opts[:timezone]
+         params = @options.merge({body: body.to_json})
+
+         self.class.put("/scrapers/#{scraper_name}", params)
+       end
+
+       def delete(scraper_name, opts={})
+         params = @options.merge(opts)
+         self.class.delete("/scrapers/#{scraper_name}", params)
+       end
+     end
+   end
+ end
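A sketch of creating and then updating a scraper with the client above. The repository URL, scraper name, and option values are placeholders; note that both short keys (:branch, :workers, :browsers, :repo) and the full field names are accepted, as the create and update bodies show.

require 'datahen'

scrapers = Datahen::Client::Scraper.new

# Register a scraper from a Git repository (short option keys map to
# git_branch / standard_worker_count / browser_worker_count above).
scrapers.create('my-scraper', 'https://github.com/example/my-scraper.git',
                branch: 'master',
                workers: 1)

# Later, adjust its schedule without touching the other settings.
scrapers.update('my-scraper', schedule: '0 6 * * *', timezone: 'America/Toronto')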
data/lib/datahen/client/scraper_deployment.rb
@@ -0,0 +1,18 @@
+ module Datahen
+   module Client
+     class ScraperDeployment < Datahen::Client::Base
+
+       def all(scraper_name, opts={})
+         params = @options.merge(opts)
+         self.class.get("/scrapers/#{scraper_name}/deployments", params)
+       end
+
+
+       def deploy(scraper_name, opts={})
+         params = @options.merge(opts)
+         self.class.post("/scrapers/#{scraper_name}/deployments", params)
+       end
+
+     end
+   end
+ end
data/lib/datahen/client/scraper_export.rb
@@ -0,0 +1,22 @@
+ module Datahen
+   module Client
+     class ScraperExport < Datahen::Client::Base
+       def all(scraper_name, opts={})
+         params = @options.merge(opts)
+         self.class.get("/scrapers/#{scraper_name}/exports", params)
+       end
+
+       def find(export_id)
+         self.class.get("/scrapers/exports/#{export_id}", @options)
+       end
+
+       def create(scraper_name, exporter_name)
+         self.class.post("/scrapers/#{scraper_name}/exports/#{exporter_name}", @options)
+       end
+
+       def download(export_id)
+         self.class.get("/scrapers/exports/#{export_id}/download", @options)
+       end
+     end
+   end
+ end
data/lib/datahen/client/scraper_exporter.rb
@@ -0,0 +1,14 @@
+ module Datahen
+   module Client
+     class ScraperExporter < Datahen::Client::Base
+       def all(scraper_name, opts={})
+         params = @options.merge(opts)
+         self.class.get("/scrapers/#{scraper_name}/exporters", params)
+       end
+
+       def find(scraper_name, exporter_name)
+         self.class.get("/scrapers/#{scraper_name}/exporters/#{exporter_name}", @options)
+       end
+     end
+   end
+ end
data/lib/datahen/client/scraper_finisher.rb
@@ -0,0 +1,16 @@
+ module Datahen
+   module Client
+     class ScraperFinisher < Datahen::Client::Base
+       # Reset finisher on a scraper's current job.
+       #
+       # @param [String] scraper_name Scraper name.
+       # @param [Hash] opts ({}) API custom parameters.
+       #
+       # @return [HTTParty::Response]
+       def reset(scraper_name, opts={})
+         params = @options.merge(opts)
+         self.class.put("/scrapers/#{scraper_name}/current_job/finisher/reset", params)
+       end
+     end
+   end
+ end
data/lib/datahen/client/scraper_job.rb
@@ -0,0 +1,49 @@
+ module Datahen
+   module Client
+     class ScraperJob < Datahen::Client::Base
+       def all(scraper_name, opts={})
+         params = @options.merge(opts)
+         self.class.get("/scrapers/#{scraper_name}/jobs", params)
+       end
+
+       def create(scraper_name, opts={})
+         body = {}
+         body[:standard_worker_count] = opts[:workers] if opts[:workers]
+         body[:browser_worker_count] = opts[:browsers] if opts[:browsers]
+         body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
+         params = @options.merge({body: body.to_json})
+         self.class.post("/scrapers/#{scraper_name}/jobs", params)
+       end
+
+       def find(scraper_name)
+         self.class.get("/scrapers/#{scraper_name}/current_job", @options)
+       end
+
+       def update(scraper_name, opts={})
+         body = {}
+         body[:status] = opts[:status] if opts[:status]
+         body[:standard_worker_count] = opts[:workers] if opts[:workers]
+         body[:browser_worker_count] = opts[:browsers] if opts[:browsers]
+         body[:proxy_type] = opts[:proxy_type] if opts[:proxy_type]
+         params = @options.merge({body: body.to_json})
+
+         self.class.put("/scrapers/#{scraper_name}/current_job", params)
+       end
+
+       def cancel(scraper_name, opts={})
+         opts[:status] = 'cancelled'
+         update(scraper_name, opts)
+       end
+
+       def resume(scraper_name, opts={})
+         opts[:status] = 'active'
+         update(scraper_name, opts)
+       end
+
+       def pause(scraper_name, opts={})
+         opts[:status] = 'paused'
+         update(scraper_name, opts)
+       end
+     end
+   end
+ end
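The ScraperJob client above wraps the job lifecycle: cancel, resume, and pause are thin shortcuts that set :status and delegate to update. A hedged usage sketch, with a placeholder scraper name and worker count:

require 'datahen'

jobs = Datahen::Client::ScraperJob.new

jobs.create('my-scraper', workers: 2)   # start a new job on the scraper
jobs.find('my-scraper')                 # inspect the current job
jobs.pause('my-scraper')                # status => 'paused'
jobs.resume('my-scraper')               # status => 'active'
jobs.cancel('my-scraper')               # status => 'cancelled'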
data/lib/datahen/client/scraper_job_output.rb
@@ -0,0 +1,19 @@
+ module Datahen
+   module Client
+     class ScraperJobOutput < Datahen::Client::Base
+       def find(scraper_name, collection, id)
+         self.class.get("/scrapers/#{scraper_name}/current_job/output/collections/#{collection}/records/#{id}", @options)
+       end
+
+       def all(scraper_name, collection = 'default')
+
+         self.class.get("/scrapers/#{scraper_name}/current_job/output/collections/#{collection}/records", @options)
+       end
+
+       def collections(scraper_name)
+         self.class.get("/scrapers/#{scraper_name}/current_job/output/collections", @options)
+       end
+     end
+   end
+ end
+
data/lib/datahen/client/scraper_job_page.rb
@@ -0,0 +1,67 @@
+ module Datahen
+   module Client
+     class ScraperJobPage < Datahen::Client::Base
+       def find(scraper_name, gid)
+         self.class.get("/scrapers/#{scraper_name}/current_job/pages/#{gid}", @options)
+       end
+
+       def all(scraper_name, opts={})
+         params = @options.merge(opts)
+         self.class.get("/scrapers/#{scraper_name}/current_job/pages", params)
+       end
+
+       def update(scraper_name, gid, opts={})
+         body = {}
+         body[:page_type] = opts[:page_type] if opts[:page_type]
+         body[:priority] = opts[:priority] if opts[:priority]
+         body[:vars] = opts[:vars] if opts[:vars]
+
+         params = @options.merge({body: body.to_json})
+
+         self.class.put("/scrapers/#{scraper_name}/current_job/pages/#{gid}", params)
+       end
+
+       def refetch(scraper_name, opts={})
+         params = @options.merge(opts)
+         self.class.put("/scrapers/#{scraper_name}/current_job/pages/refetch", params)
+       end
+
+       def refetch_by_job(job_id, opts={})
+         params = @options.merge(opts)
+         self.class.put("/jobs/#{job_id}/pages/refetch", params)
+       end
+
+       def reparse(scraper_name, opts={})
+         params = @options.merge(opts)
+         self.class.put("/scrapers/#{scraper_name}/current_job/pages/reparse", params)
+       end
+
+       def reparse_by_job(job_id, opts={})
+         params = @options.merge(opts)
+         self.class.put("/jobs/#{job_id}/pages/reparse", params)
+       end
+
+       def enqueue(scraper_name, method, url, opts={})
+         body = {}
+         body[:method] = method != "" ? method : "GET"
+         body[:url] = url
+         body[:page_type] = opts[:page_type] if opts[:page_type]
+         body[:priority] = opts[:priority] if opts[:priority]
+         body[:fetch_type] = opts[:fetch_type] if opts[:fetch_type]
+         body[:body] = opts[:body] if opts[:body]
+         body[:headers] = opts[:headers] if opts[:headers]
+         body[:vars] = opts[:vars] if opts[:vars]
+         body[:force_fetch] = opts[:force_fetch] if opts[:force_fetch]
+         body[:freshness] = opts[:freshness] if opts[:freshness]
+         body[:ua_type] = opts[:ua_type] if opts[:ua_type]
+         body[:no_redirect] = opts[:no_redirect] if opts[:no_redirect]
+         body[:cookie] = opts[:cookie] if opts[:cookie]
+
+         params = @options.merge({body: body.to_json})
+
+         self.class.post("/scrapers/#{scraper_name}/current_job/pages", params)
+       end
+
+     end
+   end
+ end
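ScraperJobPage mirrors JobPage but addresses pages through a scraper's current job, and adds refetch/reparse variants keyed by scraper name or by job ID. An illustrative sketch with placeholder identifiers:

require 'datahen'

job_pages = Datahen::Client::ScraperJobPage.new

# Enqueue a page on the scraper's current job.
job_pages.enqueue('my-scraper', 'GET', 'https://example.com/sitemap.xml',
                  page_type: 'sitemap')

# Re-fetch or re-parse pages; extra opts are merged into the request params.
job_pages.refetch('my-scraper')
job_pages.reparse_by_job(12345)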
data/lib/datahen/client/scraper_job_var.rb
@@ -0,0 +1,28 @@
+ module Datahen
+   module Client
+     class ScraperJobVar < Datahen::Client::Base
+
+       def find(scraper_name, var_name)
+         self.class.get("/scrapers/#{scraper_name}/current_job/vars/#{var_name}", @options)
+       end
+
+       def all(scraper_name, opts={})
+         params = @options.merge opts
+         self.class.get("/scrapers/#{scraper_name}/current_job/vars", params)
+       end
+
+       def set(scraper_name, var_name, value, opts={})
+         body = {}
+         body[:value] = value
+         body[:secret] = opts[:secret] if opts[:secret]
+         params = @options.merge({body: body.to_json})
+         self.class.put("/scrapers/#{scraper_name}/current_job/vars/#{var_name}", params)
+       end
+
+       def unset(scraper_name, var_name, opts={})
+         params = @options.merge(opts)
+         self.class.delete("/scrapers/#{scraper_name}/current_job/vars/#{var_name}", params)
+       end
+     end
+   end
+ end
data/lib/datahen/client/scraper_var.rb
@@ -0,0 +1,28 @@
+ module Datahen
+   module Client
+     class ScraperVar < Datahen::Client::Base
+
+       def find(scraper_name, var_name)
+         self.class.get("/scrapers/#{scraper_name}/vars/#{var_name}", @options)
+       end
+
+       def all(scraper_name, opts={})
+         params = @options.merge opts
+         self.class.get("/scrapers/#{scraper_name}/vars", params)
+       end
+
+       def set(scraper_name, var_name, value, opts={})
+         body = {}
+         body[:value] = value
+         body[:secret] = opts[:secret] if opts[:secret]
+         params = @options.merge({body: body.to_json})
+         self.class.put("/scrapers/#{scraper_name}/vars/#{var_name}", params)
+       end
+
+       def unset(scraper_name, var_name, opts={})
+         params = @options.merge(opts)
+         self.class.delete("/scrapers/#{scraper_name}/vars/#{var_name}", params)
+       end
+     end
+   end
+ end