datahen 0.10.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (78) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +12 -0
  3. data/.travis.yml +7 -0
  4. data/CODE_OF_CONDUCT.md +74 -0
  5. data/Gemfile +6 -0
  6. data/LICENSE.txt +21 -0
  7. data/README.md +29 -0
  8. data/Rakefile +22 -0
  9. data/bin/console +14 -0
  10. data/bin/setup +8 -0
  11. data/datahen.gemspec +47 -0
  12. data/examples/fetchtest/libraries/hello.rb +9 -0
  13. data/examples/fetchtest/libraries/hello_fail.rb +10 -0
  14. data/examples/fetchtest/parsers/failed.rb +2 -0
  15. data/examples/fetchtest/parsers/find_outputs.rb +18 -0
  16. data/examples/fetchtest/parsers/home.rb +50 -0
  17. data/examples/fetchtest/parsers/nested_fail.rb +3 -0
  18. data/examples/fetchtest/parsers/simple.rb +14 -0
  19. data/examples/fetchtest/seeders/csv_seeder.rb +12 -0
  20. data/examples/fetchtest/seeders/failed.rb +1 -0
  21. data/examples/fetchtest/seeders/list_of_urls.csv +5 -0
  22. data/examples/fetchtest/seeders/seed.rb +28 -0
  23. data/examples/fetchtest/seeders/test_reset_page.rb +4 -0
  24. data/exe/hen +3 -0
  25. data/lib/datahen.rb +5 -0
  26. data/lib/datahen/cli.rb +45 -0
  27. data/lib/datahen/cli/env_var.rb +48 -0
  28. data/lib/datahen/cli/finisher.rb +40 -0
  29. data/lib/datahen/cli/global_page.rb +39 -0
  30. data/lib/datahen/cli/job.rb +30 -0
  31. data/lib/datahen/cli/job_output.rb +69 -0
  32. data/lib/datahen/cli/parser.rb +64 -0
  33. data/lib/datahen/cli/scraper.rb +185 -0
  34. data/lib/datahen/cli/scraper_deployment.rb +24 -0
  35. data/lib/datahen/cli/scraper_export.rb +51 -0
  36. data/lib/datahen/cli/scraper_exporter.rb +40 -0
  37. data/lib/datahen/cli/scraper_finisher.rb +20 -0
  38. data/lib/datahen/cli/scraper_job.rb +75 -0
  39. data/lib/datahen/cli/scraper_job_var.rb +48 -0
  40. data/lib/datahen/cli/scraper_page.rb +203 -0
  41. data/lib/datahen/cli/scraper_var.rb +48 -0
  42. data/lib/datahen/cli/seeder.rb +40 -0
  43. data/lib/datahen/client.rb +29 -0
  44. data/lib/datahen/client/auth_token.rb +50 -0
  45. data/lib/datahen/client/backblaze_content.rb +45 -0
  46. data/lib/datahen/client/base.rb +69 -0
  47. data/lib/datahen/client/deploy_key.rb +21 -0
  48. data/lib/datahen/client/env_var.rb +28 -0
  49. data/lib/datahen/client/export.rb +10 -0
  50. data/lib/datahen/client/global_page.rb +18 -0
  51. data/lib/datahen/client/job.rb +64 -0
  52. data/lib/datahen/client/job_export.rb +10 -0
  53. data/lib/datahen/client/job_log.rb +26 -0
  54. data/lib/datahen/client/job_output.rb +19 -0
  55. data/lib/datahen/client/job_page.rb +58 -0
  56. data/lib/datahen/client/job_stat.rb +16 -0
  57. data/lib/datahen/client/scraper.rb +57 -0
  58. data/lib/datahen/client/scraper_deployment.rb +18 -0
  59. data/lib/datahen/client/scraper_export.rb +22 -0
  60. data/lib/datahen/client/scraper_exporter.rb +14 -0
  61. data/lib/datahen/client/scraper_finisher.rb +16 -0
  62. data/lib/datahen/client/scraper_job.rb +49 -0
  63. data/lib/datahen/client/scraper_job_output.rb +19 -0
  64. data/lib/datahen/client/scraper_job_page.rb +67 -0
  65. data/lib/datahen/client/scraper_job_var.rb +28 -0
  66. data/lib/datahen/client/scraper_var.rb +28 -0
  67. data/lib/datahen/plugin.rb +6 -0
  68. data/lib/datahen/plugin/context_exposer.rb +55 -0
  69. data/lib/datahen/scraper.rb +18 -0
  70. data/lib/datahen/scraper/executor.rb +373 -0
  71. data/lib/datahen/scraper/finisher.rb +18 -0
  72. data/lib/datahen/scraper/parser.rb +18 -0
  73. data/lib/datahen/scraper/ruby_finisher_executor.rb +116 -0
  74. data/lib/datahen/scraper/ruby_parser_executor.rb +200 -0
  75. data/lib/datahen/scraper/ruby_seeder_executor.rb +120 -0
  76. data/lib/datahen/scraper/seeder.rb +18 -0
  77. data/lib/datahen/version.rb +3 -0
  78. metadata +270 -0
@@ -0,0 +1,48 @@
1
module Datahen
  class CLI < Thor
    # Thor subcommand group that lists, sets, shows and unsets variables
    # stored on a scraper. Every command prints the API client's response.
    class ScraperVar < Thor
      package_name "scraper var"

      # Usage banner for a command: the executable's basename, this group's
      # package name, then the command's own usage string.
      def self.banner(command, namespace = nil, subcommand = false)
        "#{basename} #{@package_name} #{command.usage}"
      end

      desc "list <scraper_name>", "List environment variables on the scraper"
      long_desc <<-LONGDESC
        List all environment variables on the scraper.
      LONGDESC
      option :page, aliases: :p, type: :numeric, desc: 'Get the next set of records by page.'
      option :per_page, aliases: :P, type: :numeric, desc: 'Number of records per page. Max 500 per page.'
      # Prints the variables of +scraper_name+ (paging comes from the options).
      def list(scraper_name)
        response = Client::ScraperVar.new(options).all(scraper_name)
        puts "#{response}"
      end

      desc "set <scraper_name> <var_name> <value>", "Set an environment var on the scraper"
      long_desc <<-LONGDESC
        Creates an environment variable\x5
        <var_name>: Var name can only consist of alphabets, numbers, underscores. Name must be unique to your scraper, otherwise it will be overwritten.\x5
        <value>: Value of variable.\x5
      LONGDESC
      option :secret, type: :boolean, desc: 'Set true to make it decrypt the value. Default: false'
      # Sets +var_name+ to +value+ on +scraper_name+; the CLI options are
      # forwarded to the client as well so it can see the :secret flag.
      def set(scraper_name, var_name, value)
        response = Client::ScraperVar.new(options).set(scraper_name, var_name, value, options)
        puts "#{response}"
      end

      desc "show <scraper_name> <var_name>", "Show an environment variable on the scraper"
      # Prints a single variable of +scraper_name+.
      def show(scraper_name, var_name)
        response = Client::ScraperVar.new(options).find(scraper_name, var_name)
        puts "#{response}"
      end

      desc "unset <scraper_name> <var_name>", "Deletes an environment variable on the scraper"
      # Removes +var_name+ from +scraper_name+ and prints the response.
      def unset(scraper_name, var_name)
        response = Client::ScraperVar.new(options).unset(scraper_name, var_name)
        puts "#{response}"
      end
    end
  end

end
@@ -0,0 +1,40 @@
1
module Datahen
  class CLI < Thor
    # Thor subcommand group for running seeder scripts against a job.
    class Seeder < Thor
      desc "try <scraper_name> <seeder_file>", "Tries a seeder file"
      long_desc <<-LONGDESC
        Takes a seeder script and tries to execute it without saving anything.\x5
        <seeder_file>: Seeder script file will be executed.\x5
      LONGDESC
      option :job, aliases: :j, type: :numeric, desc: 'Set a specific job ID'
      # Runs +seeder_file+ with the save flag off and prints the result.
      # The job is the one given via --job, otherwise the job the API
      # returns for +scraper_name+.
      def try_seed(scraper_name, seeder_file)
        job_id = options[:job] || Client::ScraperJob.new(options).find(scraper_name)['id']
        result = Datahen::Scraper::Seeder.exec_seeder(seeder_file, job_id, false)
        puts result
      end

      desc "exec <scraper_name> <seeder_file>", "Executes a seeder script onto a scraper's current job."
      long_desc <<-LONGDESC
        Takes a seeder script and execute it against a job and enqueues the pages into the scraper's current job\x5
        <seeder_file>: Seeder script file that will be executed on the scraper's current job.\x5
      LONGDESC
      option :job, aliases: :j, type: :numeric, desc: 'Set a specific job ID'
      # Runs +seeder_file+ with the save flag on and prints the result.
      # Job resolution is the same as for #try_seed.
      def exec_parse(scraper_name, seeder_file)
        job_id = options[:job] || Client::ScraperJob.new(options).find(scraper_name)['id']
        result = Datahen::Scraper::Seeder.exec_seeder(seeder_file, job_id, true)
        puts result
      end
    end
  end

end
@@ -0,0 +1,29 @@
1
+ require "datahen/client/base"
2
+ require "datahen/client/auth_token"
3
+ require "datahen/client/deploy_key"
4
+ require 'datahen/client/export'
5
+ require "datahen/client/scraper"
6
+ require "datahen/client/scraper_deployment"
7
+ require "datahen/client/scraper_job_output"
8
+ require "datahen/client/scraper_job_page"
9
+ require "datahen/client/scraper_exporter"
10
+ require "datahen/client/scraper_export"
11
+ require "datahen/client/scraper_job"
12
+ require "datahen/client/scraper_finisher"
13
+ require 'datahen/client/job_export'
14
+ require "datahen/client/job"
15
+ require "datahen/client/job_log"
16
+ require "datahen/client/global_page"
17
+ require "datahen/client/job_page"
18
+ require "datahen/client/job_output"
19
+ require "datahen/client/job_stat"
20
+ require "datahen/client/backblaze_content"
21
+ require "datahen/client/env_var"
22
+ require "datahen/client/scraper_var"
23
+ require "datahen/client/scraper_job_var"
24
+
25
+
26
# Namespace that holds the API client classes loaded by the requires above.
# It is intentionally empty here; this file only declares the module.
module Datahen
  module Client
  end
end
@@ -0,0 +1,50 @@
1
module Datahen
  module Client
    # Client for the `/auth_tokens` API endpoints. Requests are sent with the
    # request options (`@options`) prepared by Datahen::Client::Base.
    class AuthToken < Datahen::Client::Base

      # GET /auth_tokens/<token>
      #
      # @param token [String] token identifier placed in the URL path
      def find(token)
        self.class.get("/auth_tokens/#{token}", @options)
      end

      # GET /auth_tokens
      #
      # @param opts [Hash] extra request options merged over the defaults
      def all(opts={})
        params = @options.merge(opts)
        self.class.get("/auth_tokens", params)
      end

      # POST /auth_tokens with a JSON body holding the role and description.
      #
      # @param role [String]
      # @param description [String]
      # @param opts [Hash] accepted for interface compatibility; not used
      def create(role, description, opts={})
        body = {
          role: role,
          description: description}

        params = @options.merge({body: body.to_json})
        self.class.post("/auth_tokens", params)
      end

      # POST /accounts/<account_id>/auth_tokens with a JSON body holding the
      # role and description.
      #
      # @param account_id [String, Integer] account placed in the URL path
      # @param role [String]
      # @param description [String]
      def create_on_account(account_id, role, description)
        body = {
          role: role,
          description: description}

        params = @options.merge({body: body.to_json})
        self.class.post("/accounts/#{account_id}/auth_tokens", params)
      end

      # PUT /auth_tokens/<token>. The role is always sent; the description is
      # only sent when it is not blank.
      #
      # @param token [String] token identifier placed in the URL path
      # @param role [String]
      # @param description [String, nil]
      # @param opts [Hash] accepted for interface compatibility; not used
      def update(token, role, description="", opts={})
        body = {}

        body[:role] = role
        # Plain-Ruby blank check. The previous `description.present?` is an
        # ActiveSupport extension and raises NoMethodError when ActiveSupport
        # is not loaded.
        body[:description] = description unless description.to_s.strip.empty?
        params = @options.merge({body: body.to_json})

        self.class.put("/auth_tokens/#{token}", params)
      end

      # DELETE /auth_tokens/<token>, sent with an empty JSON object as body.
      #
      # @param token [String] token identifier placed in the URL path
      # @param opts [Hash] accepted for interface compatibility; not used
      def delete(token, opts={})
        body = {}
        params = @options.merge({body: body.to_json})

        self.class.delete("/auth_tokens/#{token}", params)
      end
    end
  end
end
@@ -0,0 +1,45 @@
1
+ require 'zlib'
2
+ require 'httparty'
3
+
4
+ module Datahen
5
+ module Client
6
+ class BackblazeContent
7
+ include HTTParty
8
+
9
# Issues a GET for +url+ through the class-level HTTParty client, asking for
# the body to be treated as plain text, and returns the response.
def get_content(url)
  request_options = { format: :plain }
  self.class.get(url, request_options)
end
12
+
13
# Downloads +url+ via #get_content and returns the result of running it
# through #gunzip (used instead of Zlib.gunzip).
def get_gunzipped_content(url)
  compressed = get_content(url)
  gunzip(compressed)
end
17
+
18
# Decompresses a gzip-encoded string, tolerating a truncated stream.
#
# A normal full read is attempted first. When that fails with
# "unexpected end of file", whatever can still be decoded is salvaged one
# character at a time; the error that finally stops the salvage is printed
# to stdout and the partial content is returned. Any other error is
# re-raised unchanged. The reader is always closed before returning.
#
# @param string [String] gzip-compressed bytes
# @return [String] decompressed content (possibly partial, read as binary)
# @raise [StandardError] any failure other than a premature end of stream
def gunzip(string)
  gz = Zlib::GzipReader.new(StringIO.new(string), encoding: Encoding::ASCII_8BIT)
  begin
    gz.read
  rescue => e
    raise unless e.to_s == 'unexpected end of file'

    # Append in place: rebuilding the string with `+=` for every character
    # is quadratic in the size of the salvaged content.
    salvaged = String.new
    begin
      while !gz.eof?
        salvaged << gz.readchar
      end
    rescue => e
      puts "Ignored Zlib error: #{e.to_s}"
    end
    salvaged
  end
ensure
  # gz is nil when the reader itself could not be created.
  gz.close if gz.respond_to?(:close)
end
43
+ end
44
+ end
45
+ end
@@ -0,0 +1,69 @@
1
+ require 'httparty'
2
+
3
module Datahen
  module Client
    # Common base for the API clients. It mixes in HTTParty and, on
    # construction, prepares the request options (`@options`) that
    # subclasses pass along with each request.
    class Base
      include HTTParty

      # Auth token read from the DATAHEN_TOKEN environment variable.
      def self.env_auth_token
        ENV['DATAHEN_TOKEN']
      end

      # True when DATAHEN_IGNORE_SSL, stripped of whitespace, equals "1".
      def self.env_ignore_ssl
        ENV['DATAHEN_IGNORE_SSL'].to_s.strip == '1'
      end

      # API base URL: DATAHEN_API_URL when set, the hosted API otherwise.
      def env_api_url
        ENV.fetch('DATAHEN_API_URL', 'https://app.datahen.com/api/v1')
      end

      # Whether SSL verification is skipped. An explicit value given to the
      # constructor wins; otherwise the environment setting is used and cached.
      def ignore_ssl
        @ignore_ssl = self.class.env_ignore_ssl if @ignore_ssl.nil?
        @ignore_ssl
      end

      # Current auth token, falling back to the environment value.
      def auth_token
        @auth_token ||= self.class.env_auth_token
      end

      attr_writer :auth_token

      # Recognised +opts+ keys: :ignore_ssl, :auth_token, the query filters
      # :page, :per_page, :fetch_fail, :parse_fail, :status, :page_type and
      # :gid, and :query (a Hash or a JSON String, sent as the `q` parameter).
      def initialize(opts={})
        @ignore_ssl = opts[:ignore_ssl]
        self.class.base_uri(env_api_url)
        token = opts[:auth_token]
        self.auth_token = token unless token.nil?

        request_headers = {
          "Authorization" => "Bearer #{auth_token}",
          "Content-Type" => "application/json",
        }
        @options = { headers: request_headers, verify: !ignore_ssl }

        # Query parameter name => option key, in the order they are added.
        param_sources = {
          p: :page,
          pp: :per_page,
          fetchfail: :fetch_fail,
          parsefail: :parse_fail,
          status: :status,
          page_type: :page_type,
          gid: :gid
        }
        query = {}
        param_sources.each do |param, opt_key|
          value = opts[opt_key]
          query[param] = value if value
        end

        # A String query is parsed and re-serialised, so invalid JSON raises.
        case opts[:query]
        when Hash
          query[:q] = opts[:query].to_json
        when String
          query[:q] = JSON.parse(opts[:query]).to_json
        end

        @options[:query] = query unless query.empty?
      end
    end
  end
end
@@ -0,0 +1,21 @@
1
module Datahen
  module Client
    # Client for the `/deploy_key` API endpoint. Each method merges the
    # caller's +opts+ over the default request options before the call.
    class DeployKey < Datahen::Client::Base

      # GET /deploy_key
      def find(opts={})
        self.class.get("/deploy_key", @options.merge(opts))
      end

      # POST /deploy_key
      def create(opts={})
        self.class.post("/deploy_key", @options.merge(opts))
      end

      # DELETE /deploy_key
      def delete(opts={})
        self.class.delete("/deploy_key", @options.merge(opts))
      end
    end
  end
end
@@ -0,0 +1,28 @@
1
module Datahen
  module Client
    # Client for the `/env_vars` API endpoints.
    class EnvVar < Datahen::Client::Base

      # GET /env_vars/<name>
      def find(name)
        path = "/env_vars/#{name}"
        self.class.get(path, @options)
      end

      # GET /env_vars, with +opts+ merged over the default request options.
      def all(opts={})
        self.class.get("/env_vars", @options.merge(opts))
      end

      # PUT /env_vars/<name> with a JSON body holding the value, plus the
      # secret flag when opts[:secret] is truthy.
      def set(name, value, opts={})
        payload = { value: value }
        payload[:secret] = opts[:secret] if opts[:secret]
        request = @options.merge({body: payload.to_json})
        self.class.put("/env_vars/#{name}", request)
      end

      # DELETE /env_vars/<name>, with +opts+ merged over the default options.
      def unset(name, opts={})
        self.class.delete("/env_vars/#{name}", @options.merge(opts))
      end
    end
  end
end
@@ -0,0 +1,10 @@
1
module Datahen
  module Client
    # Client for the `/scrapers/exports` API endpoint.
    class Export < Datahen::Client::Base
      # GET /scrapers/exports, with +opts+ merged over the default options.
      def all(opts={})
        request = @options.merge(opts)
        self.class.get("/scrapers/exports", request)
      end
    end
  end
end
@@ -0,0 +1,18 @@
1
module Datahen
  module Client
    # Client for the `/global_pages` API endpoints, addressed by +gid+.
    class GlobalPage < Datahen::Client::Base
      # GET /global_pages/<gid>
      def find(gid)
        path = "/global_pages/#{gid}"
        self.class.get(path, @options)
      end

      # GET /global_pages/<gid>/content
      def find_content(gid)
        path = "/global_pages/#{gid}/content"
        self.class.get(path, @options)
      end

      # GET /global_pages/<gid>/failed_content
      def find_failed_content(gid)
        path = "/global_pages/#{gid}/failed_content"
        self.class.get(path, @options)
      end
    end
  end
end
18
+
@@ -0,0 +1,64 @@
1
module Datahen
  module Client
    # Client for the `/jobs` API endpoints.
    class Job < Datahen::Client::Base
      # GET /jobs, with +opts+ merged over the default request options.
      def all(opts={})
        params = @options.merge(opts)
        self.class.get("/jobs", params)
      end

      # GET /jobs/<job_id>
      def find(job_id)
        self.class.get("/jobs/#{job_id}", @options)
      end

      # PUT /jobs/<job_id> with a JSON body built from the options present.
      #
      # @param job_id [Integer, String]
      # @param opts [Hash] :status, :workers (sent as standard_worker_count)
      #   and :browsers (sent as browser_worker_count); falsy values are
      #   left out of the body
      def update(job_id, opts={})
        body = {}
        body[:status] = opts[:status] if opts[:status]
        body[:standard_worker_count] = opts[:workers] if opts[:workers]
        body[:browser_worker_count] = opts[:browsers] if opts[:browsers]
        params = @options.merge({body: body.to_json})

        self.class.put("/jobs/#{job_id}", params)
      end

      # Updates the job with status 'cancelled'. The status is merged into a
      # copy so the caller's hash (which may be frozen) is never modified.
      def cancel(job_id, opts={})
        update(job_id, opts.merge(status: 'cancelled'))
      end

      # Updates the job with status 'active'; +opts+ is not modified.
      def resume(job_id, opts={})
        update(job_id, opts.merge(status: 'active'))
      end

      # Updates the job with status 'paused'; +opts+ is not modified.
      def pause(job_id, opts={})
        update(job_id, opts.merge(status: 'paused'))
      end

      # PUT /jobs/<job_id>/seeding_update with a JSON body.
      #
      # @param job_id [Integer, String]
      # @param opts [Hash] :outputs and :pages (each defaults to []),
      #   :seeding_status (defaults to nil) and :log_error (sent only when
      #   truthy)
      def seeding_update(job_id, opts={})
        body = {}
        body[:outputs] = opts.fetch(:outputs) {[]}
        body[:pages] = opts.fetch(:pages) {[]}
        body[:seeding_status] = opts.fetch(:seeding_status){ nil }
        body[:log_error] = opts[:log_error] if opts[:log_error]

        params = @options.merge({body: body.to_json})

        self.class.put("/jobs/#{job_id}/seeding_update", params)
      end

      # PUT /jobs/<job_id>/finisher_update with a JSON body.
      #
      # @param job_id [Integer, String]
      # @param opts [Hash] :outputs (defaults to []), :finisher_status
      #   (defaults to nil) and :log_error (sent only when truthy)
      def finisher_update(job_id, opts={})
        body = {}
        body[:outputs] = opts.fetch(:outputs) {[]}
        body[:finisher_status] = opts.fetch(:finisher_status){ nil }
        body[:log_error] = opts[:log_error] if opts[:log_error]

        params = @options.merge({body: body.to_json})

        self.class.put("/jobs/#{job_id}/finisher_update", params)
      end

    end

  end
end
@@ -0,0 +1,10 @@
1
module Datahen
  module Client
    # Client for creating exports on a job.
    class JobExport < Datahen::Client::Base
      # POST /jobs/<job_id>/exports/<exporter_name>
      def create(job_id, exporter_name)
        path = "/jobs/#{job_id}/exports/#{exporter_name}"
        self.class.post(path, @options)
      end
    end
  end
end
10
+