datahen 0.10.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (78) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +12 -0
  3. data/.travis.yml +7 -0
  4. data/CODE_OF_CONDUCT.md +74 -0
  5. data/Gemfile +6 -0
  6. data/LICENSE.txt +21 -0
  7. data/README.md +29 -0
  8. data/Rakefile +22 -0
  9. data/bin/console +14 -0
  10. data/bin/setup +8 -0
  11. data/datahen.gemspec +47 -0
  12. data/examples/fetchtest/libraries/hello.rb +9 -0
  13. data/examples/fetchtest/libraries/hello_fail.rb +10 -0
  14. data/examples/fetchtest/parsers/failed.rb +2 -0
  15. data/examples/fetchtest/parsers/find_outputs.rb +18 -0
  16. data/examples/fetchtest/parsers/home.rb +50 -0
  17. data/examples/fetchtest/parsers/nested_fail.rb +3 -0
  18. data/examples/fetchtest/parsers/simple.rb +14 -0
  19. data/examples/fetchtest/seeders/csv_seeder.rb +12 -0
  20. data/examples/fetchtest/seeders/failed.rb +1 -0
  21. data/examples/fetchtest/seeders/list_of_urls.csv +5 -0
  22. data/examples/fetchtest/seeders/seed.rb +28 -0
  23. data/examples/fetchtest/seeders/test_reset_page.rb +4 -0
  24. data/exe/hen +3 -0
  25. data/lib/datahen.rb +5 -0
  26. data/lib/datahen/cli.rb +45 -0
  27. data/lib/datahen/cli/env_var.rb +48 -0
  28. data/lib/datahen/cli/finisher.rb +40 -0
  29. data/lib/datahen/cli/global_page.rb +39 -0
  30. data/lib/datahen/cli/job.rb +30 -0
  31. data/lib/datahen/cli/job_output.rb +69 -0
  32. data/lib/datahen/cli/parser.rb +64 -0
  33. data/lib/datahen/cli/scraper.rb +185 -0
  34. data/lib/datahen/cli/scraper_deployment.rb +24 -0
  35. data/lib/datahen/cli/scraper_export.rb +51 -0
  36. data/lib/datahen/cli/scraper_exporter.rb +40 -0
  37. data/lib/datahen/cli/scraper_finisher.rb +20 -0
  38. data/lib/datahen/cli/scraper_job.rb +75 -0
  39. data/lib/datahen/cli/scraper_job_var.rb +48 -0
  40. data/lib/datahen/cli/scraper_page.rb +203 -0
  41. data/lib/datahen/cli/scraper_var.rb +48 -0
  42. data/lib/datahen/cli/seeder.rb +40 -0
  43. data/lib/datahen/client.rb +29 -0
  44. data/lib/datahen/client/auth_token.rb +50 -0
  45. data/lib/datahen/client/backblaze_content.rb +45 -0
  46. data/lib/datahen/client/base.rb +69 -0
  47. data/lib/datahen/client/deploy_key.rb +21 -0
  48. data/lib/datahen/client/env_var.rb +28 -0
  49. data/lib/datahen/client/export.rb +10 -0
  50. data/lib/datahen/client/global_page.rb +18 -0
  51. data/lib/datahen/client/job.rb +64 -0
  52. data/lib/datahen/client/job_export.rb +10 -0
  53. data/lib/datahen/client/job_log.rb +26 -0
  54. data/lib/datahen/client/job_output.rb +19 -0
  55. data/lib/datahen/client/job_page.rb +58 -0
  56. data/lib/datahen/client/job_stat.rb +16 -0
  57. data/lib/datahen/client/scraper.rb +57 -0
  58. data/lib/datahen/client/scraper_deployment.rb +18 -0
  59. data/lib/datahen/client/scraper_export.rb +22 -0
  60. data/lib/datahen/client/scraper_exporter.rb +14 -0
  61. data/lib/datahen/client/scraper_finisher.rb +16 -0
  62. data/lib/datahen/client/scraper_job.rb +49 -0
  63. data/lib/datahen/client/scraper_job_output.rb +19 -0
  64. data/lib/datahen/client/scraper_job_page.rb +67 -0
  65. data/lib/datahen/client/scraper_job_var.rb +28 -0
  66. data/lib/datahen/client/scraper_var.rb +28 -0
  67. data/lib/datahen/plugin.rb +6 -0
  68. data/lib/datahen/plugin/context_exposer.rb +55 -0
  69. data/lib/datahen/scraper.rb +18 -0
  70. data/lib/datahen/scraper/executor.rb +373 -0
  71. data/lib/datahen/scraper/finisher.rb +18 -0
  72. data/lib/datahen/scraper/parser.rb +18 -0
  73. data/lib/datahen/scraper/ruby_finisher_executor.rb +116 -0
  74. data/lib/datahen/scraper/ruby_parser_executor.rb +200 -0
  75. data/lib/datahen/scraper/ruby_seeder_executor.rb +120 -0
  76. data/lib/datahen/scraper/seeder.rb +18 -0
  77. data/lib/datahen/version.rb +3 -0
  78. metadata +270 -0
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 1ed77715ebc1abeb2b1f3e6e7056e5acca934d7b067e52109c91b4885346b83f
4
+ data.tar.gz: ec4179ea4b21e6e22bfb70d5acca4e115aee4461e60f8c31433a56aad9dbe1b5
5
+ SHA512:
6
+ metadata.gz: aa842e66b934d77aff81706574a48c49c106e0e1104406f77dd1d8713e317a4a75ec04d7204e5a07c47bb501b380dd79c5b0c76fe4a961e378f862c67e7e78fc
7
+ data.tar.gz: 372b18780ff931e73d4d7ecdae69506759df1bf8caa69275955ffba1e7d12a4488614f58375488f6cfb3384b9a31d9e37b5767f963f00da445bc1db604c4dcb8
@@ -0,0 +1,12 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /_yardoc/
4
+ /coverage/
5
+ /doc/
6
+ /pkg/
7
+ /spec/reports/
8
+ /tmp/
9
+
10
+ # rspec failure tracking
11
+ .rspec_status
12
+ Gemfile.lock
@@ -0,0 +1,7 @@
1
+ ---
2
+ sudo: false
3
+ language: ruby
4
+ cache: bundler
5
+ rvm:
6
+ - 2.4.2
7
+ before_install: gem install bundler -v 1.16.3
@@ -0,0 +1,74 @@
1
+ # Contributor Covenant Code of Conduct
2
+
3
+ ## Our Pledge
4
+
5
+ In the interest of fostering an open and welcoming environment, we as
6
+ contributors and maintainers pledge to making participation in our project and
7
+ our community a harassment-free experience for everyone, regardless of age, body
8
+ size, disability, ethnicity, gender identity and expression, level of experience,
9
+ nationality, personal appearance, race, religion, or sexual identity and
10
+ orientation.
11
+
12
+ ## Our Standards
13
+
14
+ Examples of behavior that contributes to creating a positive environment
15
+ include:
16
+
17
+ * Using welcoming and inclusive language
18
+ * Being respectful of differing viewpoints and experiences
19
+ * Gracefully accepting constructive criticism
20
+ * Focusing on what is best for the community
21
+ * Showing empathy towards other community members
22
+
23
+ Examples of unacceptable behavior by participants include:
24
+
25
+ * The use of sexualized language or imagery and unwelcome sexual attention or
26
+ advances
27
+ * Trolling, insulting/derogatory comments, and personal or political attacks
28
+ * Public or private harassment
29
+ * Publishing others' private information, such as a physical or electronic
30
+ address, without explicit permission
31
+ * Other conduct which could reasonably be considered inappropriate in a
32
+ professional setting
33
+
34
+ ## Our Responsibilities
35
+
36
+ Project maintainers are responsible for clarifying the standards of acceptable
37
+ behavior and are expected to take appropriate and fair corrective action in
38
+ response to any instances of unacceptable behavior.
39
+
40
+ Project maintainers have the right and responsibility to remove, edit, or
41
+ reject comments, commits, code, wiki edits, issues, and other contributions
42
+ that are not aligned to this Code of Conduct, or to ban temporarily or
43
+ permanently any contributor for other behaviors that they deem inappropriate,
44
+ threatening, offensive, or harmful.
45
+
46
+ ## Scope
47
+
48
+ This Code of Conduct applies both within project spaces and in public spaces
49
+ when an individual is representing the project or its community. Examples of
50
+ representing a project or community include using an official project e-mail
51
+ address, posting via an official social media account, or acting as an appointed
52
+ representative at an online or offline event. Representation of a project may be
53
+ further defined and clarified by project maintainers.
54
+
55
+ ## Enforcement
56
+
57
+ Instances of abusive, harassing, or otherwise unacceptable behavior may be
58
+ reported by contacting the project team at perry@datahen.com. All
59
+ complaints will be reviewed and investigated and will result in a response that
60
+ is deemed necessary and appropriate to the circumstances. The project team is
61
+ obligated to maintain confidentiality with regard to the reporter of an incident.
62
+ Further details of specific enforcement policies may be posted separately.
63
+
64
+ Project maintainers who do not follow or enforce the Code of Conduct in good
65
+ faith may face temporary or permanent repercussions as determined by other
66
+ members of the project's leadership.
67
+
68
+ ## Attribution
69
+
70
+ This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
71
+ available at [http://contributor-covenant.org/version/1/4][version]
72
+
73
+ [homepage]: http://contributor-covenant.org
74
+ [version]: http://contributor-covenant.org/version/1/4/
data/Gemfile ADDED
@@ -0,0 +1,6 @@
1
+ source "https://rubygems.org"
2
+
3
+ git_source(:github) {|repo_name| "https://github.com/#{repo_name}" }
4
+
5
+ # Specify your gem's dependencies in datahen.gemspec
6
+ gemspec
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2019 Parama Danoesubroto
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
@@ -0,0 +1,29 @@
1
+ # Datahen
2
+
3
+ Welcome to the DataHen gem. This gem includes the client and the CLI needed to integrate with datahen.com.
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ ```ruby
10
+ gem 'datahen'
11
+ ```
12
+
13
+ And then execute:
14
+
15
+ $ bundle
16
+
17
+ Or install it yourself as:
18
+
19
+ $ gem install datahen
20
+
21
+
22
+
23
+ ## Contributing
24
+
25
+ Bug reports and pull requests are welcome on GitHub at https://github.com/DataHenOfficial/datahen-ruby. This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [Contributor Covenant](http://contributor-covenant.org) code of conduct.
26
+
27
+ ## License
28
+
29
+ The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
@@ -0,0 +1,22 @@
1
+ require 'benchmark'
2
+ require 'bundler/gem_tasks'
3
+ require 'rake/testtask'
4
+
5
+ Rake::TestTask.new do |t|
6
+ t.libs = ['lib', 'test']
7
+ t.warning = false
8
+ t.verbose = false
9
+ t.test_files = FileList['./test/**/*_test.rb']
10
+ end
11
+
12
+ desc 'Benchmark another task execution | usage example: benchmark[my_task, param1, param2]'
13
+ task :benchmark, [:task] do |task, args|
14
+ task_name = args[:task]
15
+ if task_name.nil?
16
+ puts "Should select a task."
17
+ exit 1
18
+ end
19
+ puts Benchmark.measure{ Rake::Task[task_name].invoke *args.extras }
20
+ end
21
+
22
+ task default: :test
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "bundler/setup"
4
+ require "datahen"
5
+
6
+ # You can add fixtures and/or initialization code here to make experimenting
7
+ # with your gem easier. You can also use a different console, if you like.
8
+
9
+ # (If you use this, don't forget to add pry to your Gemfile!)
10
+ # require "pry"
11
+ # Pry.start
12
+
13
+ require "irb"
14
+ IRB.start(__FILE__)
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+ set -vx
5
+
6
+ bundle install
7
+
8
+ # Do any other automated setup that you need to do here
@@ -0,0 +1,47 @@
1
+
2
+ lib = File.expand_path("../lib", __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require "datahen/version"
5
+
6
+ Gem::Specification.new do |spec|
7
+ spec.name = "datahen"
8
+ spec.version = Datahen::VERSION
9
+ spec.authors = ["Parama Danoesubroto"]
10
+ spec.email = ["parama@datahen.com"]
11
+
12
+ spec.summary = %q{DataHen toolbelt for developers}
13
+ spec.description = %q{DataHen toolbelt to develop scrapers and other scripts}
14
+ spec.homepage = "https://datahen.com"
15
+ spec.license = "MIT"
16
+
17
+ # Restrict pushes of this gem to RubyGems.org via 'allowed_push_host'. To push to a different
18
+ # single host, change that value; delete this section to allow pushing to any host.
19
+ if spec.respond_to?(:metadata)
20
+ spec.metadata["allowed_push_host"] = "https://rubygems.org"
21
+ spec.metadata["homepage_uri"] = spec.homepage
22
+ spec.metadata["source_code_uri"] = "https://github.com/DataHenOfficial/datahen-ruby"
23
+ else
24
+ raise "RubyGems 2.0 or newer is required to protect against " \
25
+ "public gem pushes."
26
+ end
27
+
28
+ # Specify which files should be added to the gem when it is released.
29
+ # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
30
+ spec.files = Dir.chdir(File.expand_path('..', __FILE__)) do
31
+ `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
32
+ end
33
+ spec.bindir = "exe"
34
+ spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
35
+ spec.require_paths = ["lib"]
36
+ spec.required_ruby_version = '>= 2.2.2'
37
+ spec.add_dependency "thor", "~> 0.20.3"
38
+ spec.add_dependency 'httparty', '~> 0.16.2'
39
+ spec.add_dependency 'nokogiri', '~> 1.6', '< 1.10'
40
+ spec.add_development_dependency 'bundler', '>= 1.16'
41
+ spec.add_development_dependency 'rake', '>= 10.0'
42
+ spec.add_development_dependency 'minitest', '>= 5.11'
43
+ spec.add_development_dependency 'simplecov', '>= 0.16.1'
44
+ spec.add_development_dependency 'simplecov-console', '>= 0.4.2'
45
+ spec.add_development_dependency 'timecop', '>= 0.9.1'
46
+ spec.add_development_dependency 'byebug', '>= 0'
47
+ end
@@ -0,0 +1,9 @@
1
+ class Hello
2
+ def initialize
3
+ @hi = "Hello"
4
+ end
5
+
6
+ def say
7
+ @hi
8
+ end
9
+ end
@@ -0,0 +1,10 @@
1
+ class HelloFail
2
+ def initialize
3
+ @hi = "Hello"
4
+ end
5
+
6
+ def say
7
+ raise "fail from Hello class"
8
+ @hi
9
+ end
10
+ end
@@ -0,0 +1,2 @@
1
+ raise "parsing intentionally failed"
2
+ # 1 = 1
@@ -0,0 +1,18 @@
1
+
2
+ puts
3
+ puts "list all output on a collection"
4
+ puts find_outputs('home', {},2).count
5
+ puts find_outputs('home', {})
6
+
7
+ puts "find_outputs"
8
+ puts find_outputs('home', "_id": "b3d6f737731842b2be198fc3a85283b7")
9
+ puts
10
+ puts "find_outputs not found"
11
+ puts find_outputs('home', "_id": "b3d6f737731842b2be198fc3a85283b7--").inspect
12
+ # puts nil['_collection']
13
+ puts
14
+ puts "find_output"
15
+ puts find_output('home',"_id": "b3d6f737731842b2be198fc3a85283b7")['_collection']
16
+ puts
17
+ puts "find_output not found"
18
+ puts find_output('home',"_id": "b3d6f737731842b2be198fc3a85283b7--").inspect
@@ -0,0 +1,50 @@
1
+ puts `pwd`
2
+
3
+ require './libraries/hello'
4
+
5
+ hello = Hello.new
6
+ puts "hello say #{hello.say}"
7
+ puts "page gid:#{page['gid']}"
8
+ puts "page #{page}"
9
+
10
+ puts "content #{content}"
11
+
12
+ nokogiri = Nokogiri.HTML(content)
13
+
14
+ h1 = nokogiri.at('h1')
15
+ heading = h1.nil? ? '' : h1.text
16
+ text = nokogiri.text
17
+
18
+ doc1 = {
19
+ _collection: "home",
20
+ # _id: "1234",
21
+ text: text,
22
+ heading: heading,
23
+ response_headers: page['response_headers'],
24
+ some_vars: page['vars']
25
+ # url: page.url
26
+ }
27
+ doc2 = {
28
+ _collection: "home",
29
+ # _id: "12345",
30
+ text: text,
31
+ heading: heading,
32
+ response_headers: page['response_headers'],
33
+ some_vars: page['vars']
34
+ # url: page.url
35
+ }
36
+
37
+
38
+ outputs << doc1
39
+ outputs << {}
40
+ outputs << doc2
41
+
42
+
43
+ pages << {
44
+ url: "http://fetchtest.datahen.com/statuses/200?q=queuedFromParserWithVars",
45
+ vars: {"abc":[1,2,3], "def": "defcontent"}
46
+ }
47
+
48
+ puts "inspect page: #{page}"
49
+
50
+ puts "inspect vars: #{page['vars']}"
@@ -0,0 +1,3 @@
1
+ require './libraries/hello_fail'
2
+ hello = HelloFail.new
3
+ puts "hello say #{hello.say}"
@@ -0,0 +1,14 @@
1
+ nokogiri = Nokogiri.HTML(content)
2
+
3
+ outputs << {
4
+ _collection: "home",
5
+ _id: "1234",
6
+ text: nokogiri.text,
7
+ heading: nokogiri.at('h1').text,
8
+ response_headers: page['response_headers'],
9
+ }
10
+
11
+ pages << {
12
+ url: "http://fetchtest.datahen.com/statuses/200?q=queuedFromParser",
13
+ vars: {"abc":[1,2,3], "def": "defcontent"}
14
+ }
@@ -0,0 +1,12 @@
1
+ CSV.foreach("./seeders/list_of_urls.csv",:headers => true) do |row|
2
+ pages << {
3
+ url: row['url'],
4
+ page_type: row['page_type'],
5
+ vars: {"abc":[1,2,3], "def": "defcontent"}
6
+ }
7
+
8
+ # Save pages to the job in batches (every max_records rows) so the pending list does not grow too large
9
+ max_records = 100
10
+ save_pages(pages) if $. % max_records == 0
11
+ end
12
+
@@ -0,0 +1 @@
1
+ raise "fail from seeder"
@@ -0,0 +1,5 @@
1
+ url,page_type
2
+ http://fetchtest.datahen.com,home
3
+ http://fetchtest.datahen.com/statuses/200,statuses
4
+ http://fetchtest.datahen.com/statuses/200?q=1,statuses
5
+ http://fetchtest.datahen.com/statuses/200?q=2,statuses
@@ -0,0 +1,28 @@
1
+ puts "hello from seeder"
2
+
3
+ pages << {
4
+ url: "http://fetchtest.datahen.com/statuses/200?q=queuedFromParser",
5
+ vars: {"abc":[1], "def": "defcontent"}
6
+ }
7
+
8
+ pages << {
9
+ url: "http://fetchtest.datahen.com/statuses/200?q=queuedFromParser2",
10
+ vars: {"abc":[2], "def": "defcontent"}
11
+ }
12
+
13
+ save_pages(pages)
14
+
15
+ pages << {
16
+ url: "http://fetchtest.datahen.com/statuses/200?q=queuedFromParser3",
17
+ vars: {"abc":[3], "def": "defcontent"}
18
+ }
19
+
20
+ pages << {
21
+ url: "http://fetchtest.datahen.com/statuses/200?q=queuedFromParser4",
22
+ vars: {"abc":[3], "def": "defcontent"}
23
+ }
24
+
25
+ pages << {
26
+ url: "http://fetchtest.datahen.com/statuses/200?q=queuedFromParser5",
27
+ vars: {"abc":[3], "def": "defcontent"}
28
+ }
@@ -0,0 +1,4 @@
1
+ pages << {
2
+ url: "http://fetchtest.datahen.com/statuses/200?q=queuedFromParser",
3
+ reset: true
4
+ }
data/exe/hen ADDED
@@ -0,0 +1,3 @@
1
+ #!/usr/bin/env ruby
2
+ require 'datahen/cli'
3
+ Datahen::CLI.start