scraper_rb 0.0.0 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 18fb3bfb0c62f76e8c57f1d91a9c289550268a1a9b5651576bfd8e0cd378f7ef
4
- data.tar.gz: 9e4ee940c85e8b3f88463ab0d4c3c41cf5ecfe5f678711690a33a9ece00a9064
3
+ metadata.gz: d4b01521d5e4d7aee51cce39790a306c9c2be362461025c738b539fbde5d9a43
4
+ data.tar.gz: e0ebb0969b2e1c7fb151bce41824b23fc4e415931b5ce3c44286578aeb8557e4
5
5
  SHA512:
6
- metadata.gz: c2d6ecdc3cd7f6396a29acd40489912048748408484793f441d7928af4816fa0064310981913d45d04e842f7781767c078f5bfb317869a26df88bce1a4a32327
7
- data.tar.gz: 614e998ecce102cc2bbd1c22d3e7a15f3de1ff98d1676844df1c4681dad9d9662419bed7ea111a473776ae6d9b6e5c833879e99eec8fff2d6884874369a4b878
6
+ metadata.gz: ff4af0bdbbe06ff13addae47f905a08c200c96edc18d1fb05f2caf46784ecad6c684cd875ef19354dc661af2fe1a270ace55c22efb4a0bd66c7893d86f8b62e9
7
+ data.tar.gz: 4a111ec06fa205873b94a93f31924f60a3c6ddce1cbadcfc7a60fe7c76ca1c95996c39a9f93d54bcd7ee12d853673be8f10d69b54019e601106818f7b6baadfb
@@ -1,5 +1,5 @@
1
1
  [bumpversion]
2
- current_version = 0.0.0
2
+ current_version = 0.1.0
3
3
  commit = True
4
4
 
5
5
  [bumpversion:file:README.md]
@@ -12,4 +12,4 @@ replace = VERSION = "{new_version}"
12
12
 
13
13
  [bumpversion:file:Gemfile.lock]
14
14
  search = scraper_rb ({current_version})
15
- replace = scraper_rb ({new_version})
15
+ replace = scraper_rb ({new_version})
data/.gitignore CHANGED
@@ -6,3 +6,4 @@
6
6
  /pkg/
7
7
  /spec/reports/
8
8
  /tmp/
9
+ .env
@@ -1,25 +1,33 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- scraper_rb (0.0.0)
4
+ scraper_rb (0.1.0)
5
5
  faraday (~> 1.0, >= 1.0.1)
6
6
 
7
7
  GEM
8
8
  remote: https://rubygems.org/
9
9
  specs:
10
+ awesome_print (1.8.0)
11
+ bond (0.5.1)
12
+ colorize (0.8.1)
10
13
  faraday (1.0.1)
11
14
  multipart-post (>= 1.2, < 3)
12
15
  minitest (5.14.2)
13
16
  multipart-post (2.1.1)
14
17
  rake (12.3.3)
18
+ wirble (0.1.3)
15
19
 
16
20
  PLATFORMS
17
21
  ruby
18
22
 
19
23
  DEPENDENCIES
24
+ awesome_print (~> 1.8)
25
+ bond (~> 0.5.1)
26
+ colorize (~> 0.8.1)
20
27
  minitest (~> 5.0)
21
28
  rake (~> 12.0)
22
29
  scraper_rb!
30
+ wirble (~> 0.1.3)
23
31
 
24
32
  BUNDLED WITH
25
33
  2.1.4
data/README.md CHANGED
@@ -21,14 +21,77 @@ $ gem install scraper_rb
21
21
  or; install from GitHub:
22
22
 
23
23
  ```bash
24
- $ gem install scraper_rb --version "0.0.0" --source "https://rubygems.pkg.github.com/promptapi"
24
+ $ gem install scraper_rb --version "0.1.0" --source "https://rubygems.pkg.github.com/promptapi"
25
25
  ```
26
26
 
27
27
  ---
28
28
 
29
29
  ## Example Usage
30
30
 
31
- @wip
31
+ Basic scraper:
32
+
33
+ ```ruby
34
+ require "scraper_rb"
35
+
36
+ s = ScraperRb.new('https://pypi.org/classifiers/') # no params
37
+ s.get
38
+ s.response
39
+ # {
40
+ # :headers=>{:"Content-Length"=>...},
41
+ # :url=>"https://pypi.org/classifiers/",
42
+ # :data=>"<!DOCTYPE html>\n<html> ...",
43
+ # }
44
+
45
+ s.response[:headers] # => return response headers
46
+ s.response[:data] # => return scraped html
47
+ s.save('/tmp/data.html') # => {:file=>"/tmp/data.html", :size=>321322}
48
+
49
+ # or
50
+
51
+ save_result = s.save('/tmp/data.html')
52
+ puts save_result[:error] if save_result.key?(:error) # we have a file error
53
+ ```
54
+
55
+ You can add url parameters for extra operations. Valid parameters are:
56
+
57
+ - `auth_password`: for HTTP Realm auth password
58
+ - `auth_username`: for HTTP Realm auth username
59
+ - `cookie`: URL Encoded cookie header.
60
+ - `country`: 2-character country code. Use it when you wish to scrape from an IP address of a specific country.
61
+ - `referer`: HTTP referer header
62
+ - `selector`: CSS style selector path such as `a.btn div li`. If `selector`
63
+ is set, the returned result will be a collection of data and the saved file
64
+ will be in `.json` format.
65
+
66
+ Here is an example using url parameters and `selector`:
67
+
68
+ ```ruby
69
+ require "scraper_rb"
70
+
71
+ params = {country: 'EE', selector: 'ul li button[data-clipboard-text]'}
72
+ s = ScraperRb.new('https://pypi.org/classifiers/', params)
73
+ s.get
74
+ s.response[:headers] # => return response headers
75
+ s.response[:data] # => return an array, collection of given selector
76
+ s.response[:data].length # => 734
77
+ s.save('/tmp/test.json') # => {:file=>"/tmp/test.json", :size=>174449}
78
+
79
+ # or
80
+
81
+ save_result = s.save('/tmp/test.json')
82
+ puts save_result[:error] if save_result.key?(:error) # we have a file error
83
+ ```
84
+
85
+ Default **timeout** value is set to `10` seconds. You can change this while
86
+ initializing the instance:
87
+
88
+ ```ruby
89
+ s = ScraperRb.new('https://pypi.org/classifiers/', {}, timeout=50)
90
+ # => 50 seconds timeout w/o params
91
+
92
+ s = ScraperRb.new('https://pypi.org/classifiers/', {country: 'EE'}, timeout=50)
93
+ # => 50 seconds timeout
94
+ ```
32
95
 
33
96
  ---
34
97
 
@@ -56,6 +119,9 @@ rake release[remote] # Create tag v0.0.0 and build and push bin_checker_rb-X.X.
56
119
  rake test # Run tests
57
120
  ```
58
121
 
122
+ - If you have `PROMPTAPI_TOKEN` set, tests based on real HTTP requests become available.
123
+ - Set `RUBY_DEVELOPMENT` to `1` for more verbose test results
124
+
59
125
  ---
60
126
 
61
127
  ## License
@@ -1,6 +1,105 @@
1
1
  require "scraper_rb/version"
2
+ require 'faraday'
3
+ require 'json'
2
4
 
3
5
  module ScraperRb
4
6
  class Error < StandardError; end
5
- # Your code goes here...
7
+
8
+ # <3 mislav
9
+ # https://mislav.net/2011/07/faraday-advanced-http/
10
# Middleware that writes the URL of each request to standard error and then
# hands the request environment to the next element of the stack.
# Technique taken from https://mislav.net/2011/07/faraday-advanced-http/
class CustomURLMiddleware
  # app     - the next callable in the stack; receives the env in #call.
  # options - stored on the instance; not read anywhere in this class.
  def initialize(app, options = {})
    @app, @options = app, options
  end

  # Logs env[:url] to $stderr, then returns whatever the wrapped app returns.
  def call(env)
    requested_url = env[:url]
    $stderr.puts "-> #{requested_url}"
    @app.call(env)
  end
end
21
+
22
class << self
  # Convenience constructor: builds a ScraperRb::Scraper from the given
  # target url, optional params Hash and timeout (defaults: {} and 10).
  # When RUBY_DEVELOPMENT is set, params and timeout are echoed to stdout first.
  def new(url, params={}, timeout=10)
    if ENV['RUBY_DEVELOPMENT']
      puts "params: #{params}"
      puts "timeout: #{timeout}"
    end
    ScraperRb::Scraper.new(url, params, timeout)
  end
end
29
+
30
# Client for the Prompt API scraper endpoint. Request options are assembled at
# construction time, #get performs the HTTP request through Faraday and stores
# the decoded result in +response+, and #save writes +response[:data]+ to disk.
class Scraper
  # Names of the optional query parameters that are forwarded to the API.
  VALID_PARAMS = ['auth_password', 'auth_username', 'cookie', 'country', 'referer', 'selector'].freeze

  attr_accessor :options, :response

  # url     - address of the page to scrape; sent as the +url+ query parameter.
  # params  - Hash of extra query parameters (nil is treated as empty). Keys may
  #           be Strings or Symbols; only names listed in VALID_PARAMS are kept
  #           and they are stored under Symbol keys.
  # timeout - stored as the request timeout option passed to Faraday.
  #
  # The API key is read from PROMPTAPI_TOKEN here, not when #get is called.
  def initialize(url, params, timeout)
    params ||= {}
    @options = {
      url: ENV['PROMPTAPI_TEST_ENDPOINT'] || 'https://api.promptapi.com/scraper',
      params: {url: url},
      request: {timeout: timeout},
      headers: {'Accept' => 'application/json', 'apikey' => ENV['PROMPTAPI_TOKEN']},
    }
    # params may contain auth_password, so only echo them in development mode,
    # in line with the other debug prints in this file.
    puts "-> params: #{params}" if ENV['RUBY_DEVELOPMENT']
    params.each do |key, value|
      name = key.to_s
      @options[:params][name.to_sym] = value if VALID_PARAMS.include?(name)
    end

    @response = {}
  end

  # Decodes a JSON document with symbolized names.
  # Returns {error: "JSON decoding error"} when body is not valid JSON or is
  # not a String at all (e.g. nil).
  def parse(body)
    JSON.parse(body, symbolize_names: true)
  rescue JSON::ParserError, TypeError
    {error: "JSON decoding error"}
  end

  # Performs the GET request and stores the outcome in +response+.
  #
  # When no API key was available at construction, no request is made:
  # +response+ becomes an error Hash and the method returns nil.
  # On success +response+ is the decoded JSON body; if it carries a
  # "data-selector" entry, that value is copied to :data so callers can read
  # :data in both modes. Connection, timeout, client and server errors raised
  # by Faraday are turned into {error: ...} Hashes instead of propagating.
  def get
    unless @options[:headers]['apikey']
      @response = {error: "You need to set PROMPTAPI_TOKEN environment variable"}
      return
    end

    conn = Faraday.new(@options) do |c|
      c.use Faraday::Response::RaiseError
      c.use CustomURLMiddleware if ENV['RUBY_DEVELOPMENT']
    end

    begin
      response = conn.get
      @response = parse(response.body)
      # Guard against a non-Hash JSON document (e.g. a top-level array).
      if @response.is_a?(Hash) && @response.key?(:"data-selector")
        @response[:data] = @response[:"data-selector"]
      end
    rescue Faraday::ConnectionFailed
      @response = {error: "Connection error"}
    rescue Faraday::TimeoutError => e
      @response = {error: e.message.capitalize}
    rescue Faraday::ClientError => e
      # e.response can be nil when the error was not built from an HTTP reply.
      error_body = e.response && e.response[:body]
      @response = {error: parse(error_body)}
    rescue Faraday::ServerError => e
      @response = {error: e.message.capitalize}
    end
  end

  # Writes +response[:data]+ to disk.
  #
  # filename - target path. The extension is forced to ".json" when the data is
  #            an Array (written as JSON) and to ".html" otherwise.
  #
  # Returns {file: path, size: bytes} on success, or {error: message} when no
  # data is available or the file system refuses the write (missing directory,
  # permission denied, target is a directory, ...).
  def save(filename)
    return {error: 'Data is not available'} unless @response[:data]

    save_extension = '.html'
    save_data = @response[:data]
    if save_data.is_a?(Array)
      save_extension = '.json'
      save_data = JSON.generate(save_data)
    end

    file_dirname = File.dirname(filename)
    file_basename = File.basename(filename, save_extension)
    file_savename = "#{file_dirname}/#{file_basename}#{save_extension}"

    File.open(file_savename, 'w') { |file| file.write(save_data) }
    {file: file_savename, size: File.size(file_savename)}
  rescue SystemCallError => e
    # SystemCallError is the parent of every Errno::* class, so all OS-level
    # file errors are reported through the same {error: ...} shape.
    {error: "#{e}"}
  end
end
6
105
  end
@@ -1,3 +1,3 @@
1
1
  module ScraperRb
2
- VERSION = "0.0.0"
2
+ VERSION = "0.1.0"
3
3
  end
@@ -18,5 +18,10 @@ Gem::Specification.new do |spec|
18
18
  `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
19
19
  end
20
20
  spec.require_paths = ["lib"]
21
+ spec.add_development_dependency 'wirble', '~> 0.1.3'
22
+ spec.add_development_dependency 'awesome_print', '~> 1.8'
23
+ spec.add_development_dependency 'bond', '~> 0.5.1'
24
+ spec.add_development_dependency 'colorize', '~> 0.8.1'
25
+
21
26
  spec.add_runtime_dependency 'faraday', '~> 1.0', '>= 1.0.1'
22
27
  end
metadata CHANGED
@@ -1,15 +1,71 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: scraper_rb
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.0
4
+ version: 0.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Prompt API
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-09-04 00:00:00.000000000 Z
11
+ date: 2020-09-07 00:00:00.000000000 Z
12
12
  dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: wirble
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: 0.1.3
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: 0.1.3
27
+ - !ruby/object:Gem::Dependency
28
+ name: awesome_print
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '1.8'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '1.8'
41
+ - !ruby/object:Gem::Dependency
42
+ name: bond
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: 0.5.1
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: 0.5.1
55
+ - !ruby/object:Gem::Dependency
56
+ name: colorize
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: 0.8.1
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: 0.8.1
13
69
  - !ruby/object:Gem::Dependency
14
70
  name: faraday
15
71
  requirement: !ruby/object:Gem::Requirement