datahen 0.10.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +12 -0
  3. data/.travis.yml +7 -0
  4. data/CODE_OF_CONDUCT.md +74 -0
  5. data/Gemfile +6 -0
  6. data/LICENSE.txt +21 -0
  7. data/README.md +29 -0
  8. data/Rakefile +22 -0
  9. data/bin/console +14 -0
  10. data/bin/setup +8 -0
  11. data/datahen.gemspec +47 -0
  12. data/examples/fetchtest/libraries/hello.rb +9 -0
  13. data/examples/fetchtest/libraries/hello_fail.rb +10 -0
  14. data/examples/fetchtest/parsers/failed.rb +2 -0
  15. data/examples/fetchtest/parsers/find_outputs.rb +18 -0
  16. data/examples/fetchtest/parsers/home.rb +50 -0
  17. data/examples/fetchtest/parsers/nested_fail.rb +3 -0
  18. data/examples/fetchtest/parsers/simple.rb +14 -0
  19. data/examples/fetchtest/seeders/csv_seeder.rb +12 -0
  20. data/examples/fetchtest/seeders/failed.rb +1 -0
  21. data/examples/fetchtest/seeders/list_of_urls.csv +5 -0
  22. data/examples/fetchtest/seeders/seed.rb +28 -0
  23. data/examples/fetchtest/seeders/test_reset_page.rb +4 -0
  24. data/exe/hen +3 -0
  25. data/lib/datahen.rb +5 -0
  26. data/lib/datahen/cli.rb +45 -0
  27. data/lib/datahen/cli/env_var.rb +48 -0
  28. data/lib/datahen/cli/finisher.rb +40 -0
  29. data/lib/datahen/cli/global_page.rb +39 -0
  30. data/lib/datahen/cli/job.rb +30 -0
  31. data/lib/datahen/cli/job_output.rb +69 -0
  32. data/lib/datahen/cli/parser.rb +64 -0
  33. data/lib/datahen/cli/scraper.rb +185 -0
  34. data/lib/datahen/cli/scraper_deployment.rb +24 -0
  35. data/lib/datahen/cli/scraper_export.rb +51 -0
  36. data/lib/datahen/cli/scraper_exporter.rb +40 -0
  37. data/lib/datahen/cli/scraper_finisher.rb +20 -0
  38. data/lib/datahen/cli/scraper_job.rb +75 -0
  39. data/lib/datahen/cli/scraper_job_var.rb +48 -0
  40. data/lib/datahen/cli/scraper_page.rb +203 -0
  41. data/lib/datahen/cli/scraper_var.rb +48 -0
  42. data/lib/datahen/cli/seeder.rb +40 -0
  43. data/lib/datahen/client.rb +29 -0
  44. data/lib/datahen/client/auth_token.rb +50 -0
  45. data/lib/datahen/client/backblaze_content.rb +45 -0
  46. data/lib/datahen/client/base.rb +69 -0
  47. data/lib/datahen/client/deploy_key.rb +21 -0
  48. data/lib/datahen/client/env_var.rb +28 -0
  49. data/lib/datahen/client/export.rb +10 -0
  50. data/lib/datahen/client/global_page.rb +18 -0
  51. data/lib/datahen/client/job.rb +64 -0
  52. data/lib/datahen/client/job_export.rb +10 -0
  53. data/lib/datahen/client/job_log.rb +26 -0
  54. data/lib/datahen/client/job_output.rb +19 -0
  55. data/lib/datahen/client/job_page.rb +58 -0
  56. data/lib/datahen/client/job_stat.rb +16 -0
  57. data/lib/datahen/client/scraper.rb +57 -0
  58. data/lib/datahen/client/scraper_deployment.rb +18 -0
  59. data/lib/datahen/client/scraper_export.rb +22 -0
  60. data/lib/datahen/client/scraper_exporter.rb +14 -0
  61. data/lib/datahen/client/scraper_finisher.rb +16 -0
  62. data/lib/datahen/client/scraper_job.rb +49 -0
  63. data/lib/datahen/client/scraper_job_output.rb +19 -0
  64. data/lib/datahen/client/scraper_job_page.rb +67 -0
  65. data/lib/datahen/client/scraper_job_var.rb +28 -0
  66. data/lib/datahen/client/scraper_var.rb +28 -0
  67. data/lib/datahen/plugin.rb +6 -0
  68. data/lib/datahen/plugin/context_exposer.rb +55 -0
  69. data/lib/datahen/scraper.rb +18 -0
  70. data/lib/datahen/scraper/executor.rb +373 -0
  71. data/lib/datahen/scraper/finisher.rb +18 -0
  72. data/lib/datahen/scraper/parser.rb +18 -0
  73. data/lib/datahen/scraper/ruby_finisher_executor.rb +116 -0
  74. data/lib/datahen/scraper/ruby_parser_executor.rb +200 -0
  75. data/lib/datahen/scraper/ruby_seeder_executor.rb +120 -0
  76. data/lib/datahen/scraper/seeder.rb +18 -0
  77. data/lib/datahen/version.rb +3 -0
  78. metadata +270 -0
@@ -0,0 +1,48 @@
module Datahen
  class CLI < Thor
    # Thor command group for the environment variables of a scraper.
    # Each command builds a Client::ScraperVar from the parsed CLI options
    # and prints whatever the client call returns.
    class ScraperVar < Thor

      package_name "scraper var"

      # Usage banner for help output: executable name, package name and the
      # command's usage string, separated by single spaces.
      def self.banner(command, namespace = nil, subcommand = false)
        [basename, @package_name, command.usage].join(' ')
      end

      desc "list <scraper_name>", "List environment variables on the scraper"
      long_desc <<-LONGDESC
          List all environment variables on the scraper.
      LONGDESC
      option :page, aliases: :p, type: :numeric, desc: 'Get the next set of records by page.'
      option :per_page, aliases: :P, type: :numeric, desc: 'Number of records per page. Max 500 per page.'
      # Prints the variables of the given scraper (paging comes from options).
      def list(scraper_name)
        response = Client::ScraperVar.new(options).all(scraper_name)
        puts response.to_s
      end

      desc "set <scraper_name> <var_name> <value>", "Set an environment var on the scraper"
      long_desc <<-LONGDESC
          Creates an environment variable\x5
          <var_name>: Var name can only consist of alphabets, numbers, underscores. Name must be unique to your scraper, otherwise it will be overwritten.\x5
          <value>: Value of variable.\x5
      LONGDESC
      option :secret, type: :boolean, desc: 'Set true to make it decrypt the value. Default: false'
      # Sets a variable on the scraper; the CLI options are forwarded to the
      # client so that it can see the :secret flag.
      def set(scraper_name, var_name, value)
        response = Client::ScraperVar.new(options).set(scraper_name, var_name, value, options)
        puts response.to_s
      end

      desc "show <scraper_name> <var_name>", "Show an environment variable on the scraper"
      # Prints a single variable of the scraper.
      def show(scraper_name, var_name)
        response = Client::ScraperVar.new(options).find(scraper_name, var_name)
        puts response.to_s
      end

      desc "unset <scraper_name> <var_name>", "Deletes an environment variable on the scraper"
      # Removes a variable from the scraper and prints the response.
      def unset(scraper_name, var_name)
        response = Client::ScraperVar.new(options).unset(scraper_name, var_name)
        puts response.to_s
      end
    end
  end

end
@@ -0,0 +1,40 @@
module Datahen
  class CLI < Thor
    # Thor command group for running seeder scripts against a job.
    # Both commands resolve a job id (the --job option when given, otherwise
    # the job returned by Client::ScraperJob#find for the scraper) and hand
    # the script to Datahen::Scraper::Seeder.exec_seeder.
    class Seeder < Thor
      desc "try <scraper_name> <seeder_file>", "Tries a seeder file"
      long_desc <<-LONGDESC
          Takes a seeder script and tries to execute it without saving anything.\x5
          <seeder_file>: Seeder script file will be executed.\x5
      LONGDESC
      option :job, aliases: :j, type: :numeric, desc: 'Set a specific job ID'
      # Runs the seeder with `false` as the third exec_seeder argument
      # (presumably the "save" flag, per the description above -- confirm
      # against Scraper::Seeder) and prints the result.
      def try_seed(scraper_name, seeder_file)
        job_id = options[:job] || Client::ScraperJob.new(options).find(scraper_name)['id']

        puts Datahen::Scraper::Seeder.exec_seeder(seeder_file, job_id, false)
      end

      desc "exec <scraper_name> <seeder_file>", "Executes a seeder script onto a scraper's current job."
      long_desc <<-LONGDESC
          Takes a seeder script and execute it against a job and enqueues the pages into the scraper's current job\x5
          <seeder_file>: Seeder script file that will be executed on the scraper's current job.\x5
      LONGDESC
      option :job, aliases: :j, type: :numeric, desc: 'Set a specific job ID'
      # Same as try_seed but passes `true` as the third exec_seeder argument.
      def exec_parse(scraper_name, seeder_file)
        job_id = options[:job] || Client::ScraperJob.new(options).find(scraper_name)['id']

        puts Datahen::Scraper::Seeder.exec_seeder(seeder_file, job_id, true)
      end
    end
  end

end
@@ -0,0 +1,29 @@
1
+ require "datahen/client/base"
2
+ require "datahen/client/auth_token"
3
+ require "datahen/client/deploy_key"
4
+ require 'datahen/client/export'
5
+ require "datahen/client/scraper"
6
+ require "datahen/client/scraper_deployment"
7
+ require "datahen/client/scraper_job_output"
8
+ require "datahen/client/scraper_job_page"
9
+ require "datahen/client/scraper_exporter"
10
+ require "datahen/client/scraper_export"
11
+ require "datahen/client/scraper_job"
12
+ require "datahen/client/scraper_finisher"
13
+ require 'datahen/client/job_export'
14
+ require "datahen/client/job"
15
+ require "datahen/client/job_log"
16
+ require "datahen/client/global_page"
17
+ require "datahen/client/job_page"
18
+ require "datahen/client/job_output"
19
+ require "datahen/client/job_stat"
20
+ require "datahen/client/backblaze_content"
21
+ require "datahen/client/env_var"
22
+ require "datahen/client/scraper_var"
23
+ require "datahen/client/scraper_job_var"
24
+
25
+
# Top-level namespace of the gem.
module Datahen
  # Namespace that holds the API client classes; it declares nothing itself.
  module Client; end
end
@@ -0,0 +1,50 @@
module Datahen
  module Client
    # API client for the auth token endpoints. All requests are sent with
    # the @options hash prepared by Datahen::Client::Base#initialize.
    class AuthToken < Datahen::Client::Base

      # GET /auth_tokens/<token>
      def find(token)
        self.class.get("/auth_tokens/#{token}", @options)
      end

      # GET /auth_tokens; +opts+ is merged over the base request options.
      def all(opts={})
        params = @options.merge(opts)
        self.class.get("/auth_tokens", params)
      end

      # POST /auth_tokens with a JSON body holding role and description.
      # +opts+ is accepted for interface compatibility but is not used.
      def create(role, description, opts={})
        body = {
          role: role,
          description: description}

        params = @options.merge({body: body.to_json})
        self.class.post("/auth_tokens", params)
      end

      # POST /accounts/<account_id>/auth_tokens with role and description.
      def create_on_account(account_id, role, description)
        body = {
          role: role,
          description: description}

        params = @options.merge({body: body.to_json})
        self.class.post("/accounts/#{account_id}/auth_tokens", params)
      end

      # PUT /auth_tokens/<token>. The role is always sent; the description is
      # only sent when it is not blank (nil, empty or whitespace only).
      # +opts+ is accepted for interface compatibility but is not used.
      def update(token, role, description="", opts={})
        body = {}

        body[:role] = role
        # Plain-Ruby blank check: `present?` is an ActiveSupport extension
        # and is not defined on core String, so calling it here would raise
        # NoMethodError when ActiveSupport is not loaded.
        body[:description] = description unless description.to_s.strip.empty?
        params = @options.merge({body: body.to_json})

        self.class.put("/auth_tokens/#{token}", params)
      end

      # DELETE /auth_tokens/<token>, sent with an empty JSON object as body.
      # +opts+ is accepted for interface compatibility but is not used.
      def delete(token, opts={})
        body = {}
        params = @options.merge({body: body.to_json})

        self.class.delete("/auth_tokens/#{token}", params)
      end
    end
  end
end
@@ -0,0 +1,45 @@
require 'stringio'
require 'zlib'
require 'httparty'

module Datahen
  module Client
    # Downloads page content from a URL and optionally gunzips it.
    class BackblazeContent
      include HTTParty

      # Fetches +url+ with HTTParty, asking for the body as plain text.
      def get_content(url)
        self.class.get(url, format: :plain)
      end

      # Fetches +url+ and returns the gunzipped body.
      def get_gunzipped_content(url)
        gunzip(get_content(url))
      end

      # Gunzips +string+ and returns the decompressed bytes.
      #
      # When the stream is truncated (Zlib reports "unexpected end of file"),
      # whatever can still be read is salvaged character by character and the
      # final Zlib error is reported on stdout instead of being raised. Any
      # other error is re-raised. The reader is always closed.
      def gunzip(string)
        sio = StringIO.new(string)
        gz = Zlib::GzipReader.new(sio, encoding: Encoding::ASCII_8BIT)
        begin
          content = gz.read
        rescue => e
          # Only a truncated stream is recoverable; everything else propagates.
          raise e unless e.to_s == 'unexpected end of file'

          # Binary buffer appended in place: the reader is opened as
          # ASCII-8BIT, and `<<` avoids reallocating the whole string per char.
          content = String.new
          begin
            content << gz.readchar until gz.eof?
          rescue => e
            puts "Ignored Zlib error: #{e.to_s}"
          end
        end

        content
      ensure
        # gz is nil when GzipReader.new itself raised.
        gz.close if gz.respond_to?(:close)
      end
    end
  end
end
@@ -0,0 +1,69 @@
1
+ require 'httparty'
2
+
module Datahen
  module Client
    # Common base of the API client classes. It mixes in HTTParty, points the
    # class at the API base URI and prepares the @options hash (headers, SSL
    # verification and optional query string) that subclasses pass along
    # with every request.
    class Base
      include HTTParty

      # Overrides the token that would otherwise come from the environment.
      attr_writer :auth_token

      # Token read from the DATAHEN_TOKEN environment variable.
      def self.env_auth_token
        ENV['DATAHEN_TOKEN']
      end

      # True when DATAHEN_IGNORE_SSL, stripped of whitespace, equals "1".
      def self.env_ignore_ssl
        ENV['DATAHEN_IGNORE_SSL'].to_s.strip == '1'
      end

      # API base URL: DATAHEN_API_URL when set, the public endpoint otherwise.
      def env_api_url
        ENV.fetch('DATAHEN_API_URL', 'https://app.datahen.com/api/v1')
      end

      # Whether SSL verification is skipped. An explicit value (including
      # false) wins; only nil falls back to the environment setting.
      def ignore_ssl
        @ignore_ssl = self.class.env_ignore_ssl if @ignore_ssl.nil?
        @ignore_ssl
      end

      # Explicitly assigned token, or the environment token as a fallback.
      def auth_token
        @auth_token ||= self.class.env_auth_token
      end

      # Builds the request options from +opts+.
      #
      # Recognised keys: :ignore_ssl, :auth_token, the paging/filter keys
      # mapped below, and :query (a Hash, or a String containing JSON).
      def initialize(opts={})
        @ignore_ssl = opts[:ignore_ssl]
        self.class.base_uri(env_api_url)
        self.auth_token = opts[:auth_token] unless opts[:auth_token].nil?
        @options = {
          headers: {
            "Authorization" => "Bearer #{auth_token}",
            "Content-Type" => "application/json",
          },
          verify: !ignore_ssl
        }

        # Query-string parameter name => option key, in the order they are sent.
        param_sources = {
          p: :page,
          pp: :per_page,
          fetchfail: :fetch_fail,
          parsefail: :parse_fail,
          status: :status,
          page_type: :page_type,
          gid: :gid
        }

        query = {}
        param_sources.each do |param, opt_key|
          value = opts[opt_key]
          query[param] = value if value
        end

        # A String query is parsed and re-serialised, so invalid JSON raises
        # here rather than being sent to the API.
        case opts[:query]
        when Hash
          query[:q] = opts[:query].to_json
        when String
          query[:q] = JSON.parse(opts[:query]).to_json
        end

        @options.merge!(query: query) unless query.empty?
      end
    end
  end
end
@@ -0,0 +1,21 @@
module Datahen
  module Client
    # API client for the single /deploy_key resource. In every method +opts+
    # is merged over the base request options before the call is made.
    class DeployKey < Datahen::Client::Base

      # GET /deploy_key
      def find(opts={})
        self.class.get("/deploy_key", @options.merge(opts))
      end

      # POST /deploy_key
      def create(opts={})
        self.class.post("/deploy_key", @options.merge(opts))
      end

      # DELETE /deploy_key
      def delete(opts={})
        self.class.delete("/deploy_key", @options.merge(opts))
      end
    end
  end
end
@@ -0,0 +1,28 @@
module Datahen
  module Client
    # API client for the /env_vars endpoints.
    class EnvVar < Datahen::Client::Base

      # GET /env_vars/<name>
      def find(name)
        self.class.get("/env_vars/#{name}", @options)
      end

      # GET /env_vars; +opts+ is merged over the base request options.
      def all(opts={})
        self.class.get("/env_vars", @options.merge(opts))
      end

      # PUT /env_vars/<name> with a JSON body carrying the value and, only
      # when opts[:secret] is truthy, the secret flag.
      def set(name, value, opts={})
        payload = { value: value }
        payload[:secret] = opts[:secret] if opts[:secret]
        request = @options.merge({body: payload.to_json})
        self.class.put("/env_vars/#{name}", request)
      end

      # DELETE /env_vars/<name>; +opts+ is merged over the base options.
      def unset(name, opts={})
        self.class.delete("/env_vars/#{name}", @options.merge(opts))
      end
    end
  end
end
@@ -0,0 +1,10 @@
module Datahen
  module Client
    # API client for listing exports.
    class Export < Datahen::Client::Base
      # GET /scrapers/exports; +opts+ is merged over the base request options.
      def all(opts={})
        self.class.get("/scrapers/exports", @options.merge(opts))
      end
    end
  end
end
@@ -0,0 +1,18 @@
module Datahen
  module Client
    # API client for global pages, addressed by their gid.
    class GlobalPage < Datahen::Client::Base
      # GET /global_pages/<gid>
      def find(gid)
        path = "/global_pages/#{gid}"
        self.class.get(path, @options)
      end

      # GET /global_pages/<gid>/content
      def find_content(gid)
        path = "/global_pages/#{gid}/content"
        self.class.get(path, @options)
      end

      # GET /global_pages/<gid>/failed_content
      def find_failed_content(gid)
        path = "/global_pages/#{gid}/failed_content"
        self.class.get(path, @options)
      end
    end
  end
end
@@ -0,0 +1,64 @@
module Datahen
  module Client
    # API client for the /jobs endpoints.
    class Job < Datahen::Client::Base
      # GET /jobs; +opts+ is merged over the base request options.
      def all(opts={})
        params = @options.merge(opts)
        self.class.get("/jobs", params)
      end

      # GET /jobs/<job_id>
      def find(job_id)
        self.class.get("/jobs/#{job_id}", @options)
      end

      # PUT /jobs/<job_id>. Only the keys present in +opts+ are sent:
      # :status, :workers (as standard_worker_count) and :browsers
      # (as browser_worker_count).
      def update(job_id, opts={})
        body = {}
        body[:status] = opts[:status] if opts[:status]
        body[:standard_worker_count] = opts[:workers] if opts[:workers]
        body[:browser_worker_count] = opts[:browsers] if opts[:browsers]
        params = @options.merge({body: body.to_json})

        self.class.put("/jobs/#{job_id}", params)
      end

      # Updates the job with status 'cancelled'. The status is merged into a
      # copy of +opts+ so the caller's hash is never modified (assigning into
      # it would also fail on a frozen hash).
      def cancel(job_id, opts={})
        update(job_id, opts.merge(status: 'cancelled'))
      end

      # Updates the job with status 'active'; +opts+ is left untouched.
      def resume(job_id, opts={})
        update(job_id, opts.merge(status: 'active'))
      end

      # Updates the job with status 'paused'; +opts+ is left untouched.
      def pause(job_id, opts={})
        update(job_id, opts.merge(status: 'paused'))
      end

      # PUT /jobs/<job_id>/seeding_update. :outputs and :pages default to an
      # empty array and :seeding_status to nil; :log_error is only sent when
      # it is truthy.
      def seeding_update(job_id, opts={})
        body = {
          outputs: opts.fetch(:outputs) { [] },
          pages: opts.fetch(:pages) { [] },
          seeding_status: opts.fetch(:seeding_status) { nil }
        }
        body[:log_error] = opts[:log_error] if opts[:log_error]

        params = @options.merge({body: body.to_json})

        self.class.put("/jobs/#{job_id}/seeding_update", params)
      end

      # PUT /jobs/<job_id>/finisher_update. :outputs defaults to an empty
      # array and :finisher_status to nil; :log_error is only sent when it
      # is truthy.
      def finisher_update(job_id, opts={})
        body = {
          outputs: opts.fetch(:outputs) { [] },
          finisher_status: opts.fetch(:finisher_status) { nil }
        }
        body[:log_error] = opts[:log_error] if opts[:log_error]

        params = @options.merge({body: body.to_json})

        self.class.put("/jobs/#{job_id}/finisher_update", params)
      end

    end

  end
end
@@ -0,0 +1,10 @@
module Datahen
  module Client
    # API client for starting an export on a job.
    class JobExport < Datahen::Client::Base
      # POST /jobs/<job_id>/exports/<exporter_name>
      def create(job_id, exporter_name)
        path = "/jobs/#{job_id}/exports/#{exporter_name}"
        self.class.post(path, @options)
      end
    end
  end
end