sagrone_scraper 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: d067420377ca0e271b6ba7f8c00f5f6ae2198b85
4
+ data.tar.gz: cc93b626d827b17e7f16fa91b7fd00b13936c318
5
+ SHA512:
6
+ metadata.gz: 804b9c719e81d87b762f1cea45c3e1919d459d7520270ac1176907e3cb14efef3f992f24f2ed71db3baa85cae7e0fb3b4c5f18394785da396df772a6eeb59755
7
+ data.tar.gz: a9b2524b7029896731942e13483e736b45115739d8cc22a7176e9afc477f6b3d460fbe8b9b88c577ae706a976ad10c30d49deba45cdfb9d65206085dd4459f3d
data/.editorconfig ADDED
@@ -0,0 +1,20 @@
1
+ # EditorConfig helps developers define and maintain consistent
2
+ # coding styles between different editors and IDEs
3
+ # editorconfig.org
4
+
5
+ root = true
6
+
7
+ [*]
8
+
9
+ # Change these settings to your own preference
10
+ indent_style = space
11
+ indent_size = 2
12
+
13
+ # We recommend you to keep these unchanged
14
+ end_of_line = lf
15
+ charset = utf-8
16
+ trim_trailing_whitespace = true
17
+ insert_final_newline = true
18
+
19
+ [*.md]
20
+ trim_trailing_whitespace = false
data/.gitignore ADDED
@@ -0,0 +1,9 @@
1
+ /bin/
2
+ /.bundle/
3
+ /vendor/bundle
4
+ /Gemfile.lock
5
+ /coverage/
6
+ /doc/
7
+ /pkg/
8
+ /spec/reports/
9
+ /tmp/
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --color
2
+ --require spec_helper
data/.travis.yml ADDED
@@ -0,0 +1,9 @@
1
+ language: ruby
2
+ rvm:
3
+ - 1.9.3
4
+ - 2.1.2
5
+ script:
6
+ - "bundle exec rspec --color --format documentation"
7
+ notifications:
8
+ recipients:
9
+ - marius.colacioiu@gmail.com
data/CHANGELOG.md ADDED
@@ -0,0 +1,5 @@
1
+ ### HEAD
2
+
3
+ ### 0.0.1
4
+
5
+ - add `SagroneScraper::Agent`
data/Gemfile ADDED
@@ -0,0 +1,3 @@
1
+ source 'https://rubygems.org'
2
+
3
+ gemspec
data/Guardfile ADDED
@@ -0,0 +1,15 @@
1
+ clearing :on
2
+
3
+ guard :rspec, cmd: "bundle exec rspec" do
4
+ require "guard/rspec/dsl"
5
+ dsl = Guard::RSpec::Dsl.new(self)
6
+
7
+ # RSpec files
8
+ rspec = dsl.rspec
9
+ watch(rspec.spec_files)
10
+ watch(%r{^spec/(.+)_helper\.rb$}) { "spec" }
11
+ watch(%r{^spec/test_responses/(.+)$}) { "spec" }
12
+
13
+ # Library files
14
+ watch(%r{^lib/(.+)\.rb$}) { |m| "spec/#{m[1]}_spec.rb" }
15
+ end
data/LICENSE ADDED
@@ -0,0 +1,13 @@
1
+ Copyright 2015 Marius Colacioiu
2
+
3
+ Licensed under the Apache License, Version 2.0 (the "License");
4
+ you may not use this file except in compliance with the License.
5
+ You may obtain a copy of the License at
6
+
7
+ http://www.apache.org/licenses/LICENSE-2.0
8
+
9
+ Unless required by applicable law or agreed to in writing, software
10
+ distributed under the License is distributed on an "AS IS" BASIS,
11
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ See the License for the specific language governing permissions and
13
+ limitations under the License.
data/README.md ADDED
@@ -0,0 +1,64 @@
1
+ # SagroneScraper
2
+
3
+ Simple library to scrape web pages. Below you will find information on [how to use it](#usage).
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ gem 'sagrone_scraper'
10
+
11
+ And then execute:
12
+
13
+ $ bundle
14
+
15
+ Or install it yourself as:
16
+
17
+ $ gem install sagrone_scraper
18
+
19
+ ## Usage
20
+
21
+ #### `SagroneScraper::Agent`
22
+
23
+ The agent is responsible for scraping a web page from a URL.
24
+
25
+ Here is how you can create an `agent`:
26
+
27
+ 1. one way is to pass it a `url` option
28
+
29
+ ```ruby
30
+ require 'sagrone_scraper/agent'
31
+
32
+ agent = SagroneScraper::Agent.new(url: 'https://twitter.com/Milano_JS')
33
+ agent.page
34
+ # => Mechanize::Page
35
+
36
+ agent.page.at('.ProfileHeaderCard-bio').text
37
+ # => "Javascript User Group Milano #milanojs"
38
+ ```
39
+
40
+ 2. another way is to pass a `page` option (`Mechanize::Page`)
41
+
42
+ ```ruby
43
+ require 'sagrone_scraper/agent'
44
+
45
+ mechanize_agent = Mechanize.new { |agent| agent.user_agent_alias = 'Linux Firefox' }
46
+ page = mechanize_agent.get('https://twitter.com/Milano_JS')
47
+ # => Mechanize::Page
48
+
49
+ agent = SagroneScraper::Agent.new(page: page)
50
+ agent.url
51
+ # => "https://twitter.com/Milano_JS"
52
+
53
+ agent.page.at('.ProfileHeaderCard-locationText').text
54
+ # => "Milan, Italy"
55
+ ```
56
+
57
+
58
+ ## Contributing
59
+
60
+ 1. Fork it ( https://github.com/[my-github-username]/sagrone_scraper/fork )
61
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
62
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
63
+ 4. Push to the branch (`git push origin my-new-feature`)
64
+ 5. Create a new Pull Request
data/Rakefile ADDED
@@ -0,0 +1,6 @@
1
+ require "bundler/gem_tasks"
2
+ require "rspec/core/rake_task"
3
+
4
+ RSpec::Core::RakeTask.new(:spec)
5
+
6
+ task :default => :spec
@@ -0,0 +1,46 @@
1
+ require 'mechanize'
2
+
3
+ module SagroneScraper
4
+ class Agent
5
+ Error = Class.new(RuntimeError)
6
+
7
+ AGENT_ALIASES = ["Linux Firefox", "Linux Mozilla", "Mac Firefox", "Mac Mozilla", "Mac Safari", "Windows Chrome", "Windows IE 8", "Windows IE 9", "Windows Mozilla"]
8
+
9
+ attr_reader :url, :page
10
+
11
+ def initialize(options = {})
12
+ raise Error.new('Exactly one option must be provided: "url" or "page"') unless exactly_one_of(options)
13
+
14
+ @url, @page = options[:url], options[:page]
15
+
16
+ @url ||= page_url
17
+ @page ||= http_client.get(url)
18
+ rescue StandardError => error
19
+ raise Error.new(error.message)
20
+ end
21
+
22
+ def http_client
23
+ @http_client ||= self.class.http_client
24
+ end
25
+
26
+ def self.http_client
27
+ Mechanize.new do |agent|
28
+ agent.user_agent_alias = AGENT_ALIASES.sample
29
+ agent.max_history = 0
30
+ end
31
+ end
32
+
33
+ private
34
+
35
+ def page_url
36
+ @page.uri.to_s
37
+ end
38
+
39
+ def exactly_one_of(options)
40
+ url_present = !!options[:url]
41
+ page_present = !!options[:page]
42
+
43
+ (url_present && !page_present) || (!url_present && page_present)
44
+ end
45
+ end
46
+ end
@@ -0,0 +1,3 @@
1
+ module SagroneScraper
2
+ VERSION = "0.0.1"
3
+ end
@@ -0,0 +1,7 @@
1
+ require "sagrone_scraper/version"
2
+
3
+ module SagroneScraper
4
+ def self.version
5
+ VERSION
6
+ end
7
+ end
@@ -0,0 +1,29 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+
5
+ require 'sagrone_scraper/version'
6
+
7
+ Gem::Specification.new do |spec|
8
+ spec.name = "sagrone_scraper"
9
+ spec.version = SagroneScraper::VERSION
10
+ spec.authors = ["Marius Colacioiu"]
11
+ spec.email = ["marius.colacioiu@gmail.com"]
12
+ spec.summary = %q{Sagrone Ruby Scraper.}
13
+ spec.description = %q{Simple library to scrap web pages.}
14
+ spec.homepage = ""
15
+ spec.license = "Apache License 2.0"
16
+
17
+ spec.files = `git ls-files -z`.split("\x0")
18
+ spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
19
+ spec.test_files = spec.files.grep(%r{^(spec)/})
20
+ spec.require_paths = ["lib"]
21
+
22
+ spec.add_dependency "mechanize", "~> 2.0"
23
+
24
+ spec.add_development_dependency "bundler"
25
+ spec.add_development_dependency "guard-rspec"
26
+ spec.add_development_dependency "rake"
27
+ spec.add_development_dependency "rspec"
28
+ spec.add_development_dependency "webmock"
29
+ end
@@ -0,0 +1,105 @@
1
+ require 'spec_helper'
2
+ require 'sagrone_scraper/agent'
3
+
4
+ RSpec.describe SagroneScraper::Agent do
5
+ let(:user_agent_aliases) do
6
+ [ "Linux Firefox", "Linux Mozilla",
7
+ "Mac Firefox", "Mac Mozilla", "Mac Safari",
8
+ "Windows Chrome", "Windows IE 8", "Windows IE 9", "Windows Mozilla" ]
9
+ end
10
+
11
+ describe 'AGENT_ALIASES' do
12
+ it { expect(described_class::AGENT_ALIASES).to eq(user_agent_aliases) }
13
+ end
14
+
15
+ describe '.http_client' do
16
+ subject { described_class.http_client }
17
+
18
+ it { should be_a(Mechanize) }
19
+ it { should respond_to(:get) }
20
+ it { expect(subject.user_agent).to match(/Mozilla\/5\.0/) }
21
+ end
22
+
23
+ describe '#initialize' do
24
+ describe 'should require exactly one of `url` or `page` option' do
25
+ before do
26
+ stub_request_for('http://example.com', 'www.example.com')
27
+ end
28
+
29
+ it 'when options is empty' do
30
+ expect { described_class.new }.to raise_error(SagroneScraper::Agent::Error,
31
+ /Exactly one option must be provided: "url" or "page"/)
32
+ end
33
+
34
+ it 'when both options are present' do
35
+ page = Mechanize.new.get('http://example.com')
36
+
37
+ expect {
38
+ described_class.new(url: 'http://example.com', page: page)
39
+ }.to raise_error(SagroneScraper::Agent::Error,
40
+ /Exactly one option must be provided: "url" or "page"/)
41
+ end
42
+ end
43
+
44
+ describe 'with page option' do
45
+ before do
46
+ stub_request_for('http://example.com', 'www.example.com')
47
+ end
48
+
49
+ let(:page) { Mechanize.new.get('http://example.com') }
50
+ let(:agent) { described_class.new(page: page) }
51
+
52
+ it { expect { agent }.to_not raise_error }
53
+ it { expect(agent.page).to be }
54
+ it { expect(agent.url).to eq 'http://example.com/' }
55
+ end
56
+
57
+ describe 'with invalid URL' do
58
+ let(:agent) { described_class.new(url: @invalid_url) }
59
+
60
+ it 'should require URL is absolute' do
61
+ @invalid_url = 'not-a-url'
62
+
63
+ expect { agent }.to raise_error(SagroneScraper::Agent::Error,
64
+ /absolute URL needed \(not not-a-url\)/)
65
+ end
66
+
67
+ it 'should require absolute path' do
68
+ @invalid_url = 'http://'
69
+
70
+ webmock_allow do
71
+ expect { agent }.to raise_error(SagroneScraper::Agent::Error,
72
+ /bad URI\(absolute but no path\)/)
73
+ end
74
+ end
75
+
76
+ it 'should require valid URL' do
77
+ @invalid_url = 'http://example'
78
+
79
+ webmock_allow do
80
+ expect { agent }.to raise_error(SagroneScraper::Agent::Error,
81
+ /getaddrinfo: nodename nor servname provided, or not known/)
82
+ end
83
+ end
84
+ end
85
+
86
+ describe 'with valid URL' do
87
+ before do
88
+ stub_request_for('http://example.com', 'www.example.com')
89
+ end
90
+
91
+ let(:agent) { described_class.new(url: 'http://example.com') }
92
+
93
+ it { expect(agent.http_client).to be_a(Mechanize) }
94
+ it { expect(agent.http_client).to equal(agent.http_client) }
95
+
96
+ it { expect { agent }.to_not raise_error }
97
+ it { expect(agent.url).to eq('http://example.com') }
98
+
99
+ it { expect(agent.page).to be_a(Mechanize::Page) }
100
+ it { expect(agent.page).to equal(agent.page) }
101
+ it { expect(agent.page).to respond_to(:at, :body, :title) }
102
+ it { expect(agent.page).to respond_to(:links, :labels, :images, :image_urls, :forms) }
103
+ end
104
+ end
105
+ end
@@ -0,0 +1,8 @@
1
+ require 'spec_helper'
2
+ require 'sagrone_scraper'
3
+
4
+ RSpec.describe SagroneScraper do
5
+ describe '.version' do
6
+ it { expect(SagroneScraper.version).to be_a(String) }
7
+ end
8
+ end
@@ -0,0 +1,24 @@
1
+ require 'stub_helper'
2
+
3
+ RSpec.configure do |config|
4
+ config.include(StubHelper)
5
+
6
+ config.expect_with :rspec do |expectations|
7
+ expectations.include_chain_clauses_in_custom_matcher_descriptions = true
8
+ end
9
+
10
+ config.mock_with :rspec do |mocks|
11
+ mocks.verify_partial_doubles = true
12
+ end
13
+
14
+ config.filter_run :focus
15
+ config.run_all_when_everything_filtered = true
16
+ config.disable_monkey_patching!
17
+ config.warnings = true
18
+
19
+ if config.files_to_run.one?
20
+ config.default_formatter = 'doc'
21
+ end
22
+
23
+ config.order = :random
24
+ end
@@ -0,0 +1,22 @@
1
+ require 'webmock/rspec'
2
+
3
+ module StubHelper
4
+ def stub_request_for(url, file_name)
5
+ stub_request(:get, url)
6
+ .to_return({
7
+ body: get_response_file(file_name),
8
+ headers: {'content-type' => 'text/html'},
9
+ status: 200
10
+ })
11
+ end
12
+
13
+ def webmock_allow(&block)
14
+ WebMock.allow_net_connect!
15
+ block.call
16
+ WebMock.disable_net_connect!
17
+ end
18
+
19
+ def get_response_file(name)
20
+ IO.read(File.join('spec/test_responses', "#{name}"))
21
+ end
22
+ end
@@ -0,0 +1,51 @@
1
+ <!DOCTYPE html>
2
+ <!-- saved from url=(0019)http://example.com/ -->
3
+ <html><head><meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
4
+ <title>Example Domain</title>
5
+
6
+ <meta charset="utf-8">
7
+
8
+ <meta name="viewport" content="width=device-width, initial-scale=1">
9
+ <style type="text/css">
10
+ body {
11
+ background-color: #f0f0f2;
12
+ margin: 0;
13
+ padding: 0;
14
+ font-family: "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;
15
+
16
+ }
17
+ div {
18
+ width: 600px;
19
+ margin: 5em auto;
20
+ padding: 50px;
21
+ background-color: #fff;
22
+ border-radius: 1em;
23
+ }
24
+ a:link, a:visited {
25
+ color: #38488f;
26
+ text-decoration: none;
27
+ }
28
+ @media (max-width: 700px) {
29
+ body {
30
+ background-color: #fff;
31
+ }
32
+ div {
33
+ width: auto;
34
+ margin: 0 auto;
35
+ border-radius: 0;
36
+ padding: 1em;
37
+ }
38
+ }
39
+ </style>
40
+ </head>
41
+
42
+ <body>
43
+ <div>
44
+ <h1>Example Domain</h1>
45
+ <p>This domain is established to be used for illustrative examples in documents. You may use this
46
+ domain in examples without prior coordination or asking for permission.</p>
47
+ <p><a href="http://www.iana.org/domains/example">More information...</a></p>
48
+ </div>
49
+
50
+
51
+ </body></html>
metadata ADDED
@@ -0,0 +1,152 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: sagrone_scraper
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Marius Colacioiu
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2015-03-06 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: mechanize
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '2.0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '2.0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: bundler
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: guard-rspec
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: rake
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: rspec
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: webmock
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ">="
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
97
+ description: Simple library to scrap web pages.
98
+ email:
99
+ - marius.colacioiu@gmail.com
100
+ executables: []
101
+ extensions: []
102
+ extra_rdoc_files: []
103
+ files:
104
+ - ".editorconfig"
105
+ - ".gitignore"
106
+ - ".rspec"
107
+ - ".travis.yml"
108
+ - CHANGELOG.md
109
+ - Gemfile
110
+ - Guardfile
111
+ - LICENSE
112
+ - README.md
113
+ - Rakefile
114
+ - lib/sagrone_scraper.rb
115
+ - lib/sagrone_scraper/agent.rb
116
+ - lib/sagrone_scraper/version.rb
117
+ - sagrone_scraper.gemspec
118
+ - spec/sagrone_scraper/agent_spec.rb
119
+ - spec/sagrone_scraper_spec.rb
120
+ - spec/spec_helper.rb
121
+ - spec/stub_helper.rb
122
+ - spec/test_responses/www.example.com
123
+ homepage: ''
124
+ licenses:
125
+ - Apache License 2.0
126
+ metadata: {}
127
+ post_install_message:
128
+ rdoc_options: []
129
+ require_paths:
130
+ - lib
131
+ required_ruby_version: !ruby/object:Gem::Requirement
132
+ requirements:
133
+ - - ">="
134
+ - !ruby/object:Gem::Version
135
+ version: '0'
136
+ required_rubygems_version: !ruby/object:Gem::Requirement
137
+ requirements:
138
+ - - ">="
139
+ - !ruby/object:Gem::Version
140
+ version: '0'
141
+ requirements: []
142
+ rubyforge_project:
143
+ rubygems_version: 2.2.2
144
+ signing_key:
145
+ specification_version: 4
146
+ summary: Sagrone Ruby Scraper.
147
+ test_files:
148
+ - spec/sagrone_scraper/agent_spec.rb
149
+ - spec/sagrone_scraper_spec.rb
150
+ - spec/spec_helper.rb
151
+ - spec/stub_helper.rb
152
+ - spec/test_responses/www.example.com