sagrone_scraper 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: d067420377ca0e271b6ba7f8c00f5f6ae2198b85
4
+ data.tar.gz: cc93b626d827b17e7f16fa91b7fd00b13936c318
5
+ SHA512:
6
+ metadata.gz: 804b9c719e81d87b762f1cea45c3e1919d459d7520270ac1176907e3cb14efef3f992f24f2ed71db3baa85cae7e0fb3b4c5f18394785da396df772a6eeb59755
7
+ data.tar.gz: a9b2524b7029896731942e13483e736b45115739d8cc22a7176e9afc477f6b3d460fbe8b9b88c577ae706a976ad10c30d49deba45cdfb9d65206085dd4459f3d
data/.editorconfig ADDED
@@ -0,0 +1,20 @@
1
+ # EditorConfig helps developers define and maintain consistent
2
+ # coding styles between different editors and IDEs
3
+ # editorconfig.org
4
+
5
+ root = true
6
+
7
+ [*]
8
+
9
+ # Change these settings to your own preference
10
+ indent_style = space
11
+ indent_size = 2
12
+
13
+ # We recommend you to keep these unchanged
14
+ end_of_line = lf
15
+ charset = utf-8
16
+ trim_trailing_whitespace = true
17
+ insert_final_newline = true
18
+
19
+ [*.md]
20
+ trim_trailing_whitespace = false
data/.gitignore ADDED
@@ -0,0 +1,9 @@
1
+ /bin/
2
+ /.bundle/
3
+ /vendor/bundle
4
+ /Gemfile.lock
5
+ /coverage/
6
+ /doc/
7
+ /pkg/
8
+ /spec/reports/
9
+ /tmp/
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --color
2
+ --require spec_helper
data/.travis.yml ADDED
@@ -0,0 +1,9 @@
1
+ language: ruby
2
+ rvm:
3
+ - 1.9.3
4
+ - 2.1.2
5
+ script:
6
+ - "bundle exec rspec --color --format documentation"
7
+ notifications:
8
+ recipients:
9
+ - marius.colacioiu@gmail.com
data/CHANGELOG.md ADDED
@@ -0,0 +1,5 @@
1
+ ### HEAD
2
+
3
+ ### 0.0.1
4
+
5
+ - add `SagroneScraper::Agent`
data/Gemfile ADDED
@@ -0,0 +1,3 @@
1
+ source 'https://rubygems.org'
2
+
3
+ gemspec
data/Guardfile ADDED
@@ -0,0 +1,15 @@
1
+ clearing :on
2
+
3
+ guard :rspec, cmd: "bundle exec rspec" do
4
+ require "guard/rspec/dsl"
5
+ dsl = Guard::RSpec::Dsl.new(self)
6
+
7
+ # RSpec files
8
+ rspec = dsl.rspec
9
+ watch(rspec.spec_files)
10
+ watch(%r{^spec/(.+)_helper\.rb$}) { "spec" }
11
+ watch(%r{^spec/test_responses/(.+)$}) { "spec" }
12
+
13
+ # Library files
14
+ watch(%r{^lib/(.+)\.rb$}) { |m| "spec/#{m[1]}_spec.rb" }
15
+ end
data/LICENSE ADDED
@@ -0,0 +1,13 @@
1
+ Copyright 2015 Marius Colacioiu
2
+
3
+ Licensed under the Apache License, Version 2.0 (the "License");
4
+ you may not use this file except in compliance with the License.
5
+ You may obtain a copy of the License at
6
+
7
+ http://www.apache.org/licenses/LICENSE-2.0
8
+
9
+ Unless required by applicable law or agreed to in writing, software
10
+ distributed under the License is distributed on an "AS IS" BASIS,
11
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ See the License for the specific language governing permissions and
13
+ limitations under the License.
data/README.md ADDED
@@ -0,0 +1,64 @@
1
+ # SagroneScraper
2
+
3
+ Simple library to scrape web pages. Below you will find information on [how to use it](#usage).
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ gem 'sagrone_scraper'
10
+
11
+ And then execute:
12
+
13
+ $ bundle
14
+
15
+ Or install it yourself as:
16
+
17
+ $ gem install sagrone_scraper
18
+
19
+ ## Usage
20
+
21
+ #### `SagroneScraper::Agent`
22
+
23
+ The agent is responsible for scraping a web page from a URL.
24
+
25
+ Here is how you can create an `agent`:
26
+
27
+ 1. one way is to pass it a `url` option
28
+
29
+ ```ruby
30
+ require 'sagrone_scraper/agent'
31
+
32
+ agent = SagroneScraper::Agent.new(url: 'https://twitter.com/Milano_JS')
33
+ agent.page
34
+ # => Mechanize::Page
35
+
36
+ agent.page.at('.ProfileHeaderCard-bio').text
37
+ # => "Javascript User Group Milano #milanojs"
38
+ ```
39
+
40
+ 2. another way is to pass a `page` option (`Mechanize::Page`)
41
+
42
+ ```ruby
43
+ require 'sagrone_scraper/agent'
44
+
45
+ mechanize_agent = Mechanize.new { |agent| agent.user_agent_alias = 'Linux Firefox' }
46
+ page = mechanize_agent.get('https://twitter.com/Milano_JS')
47
+ # => Mechanize::Page
48
+
49
+ agent = SagroneScraper::Agent.new(page: page)
50
+ agent.url
51
+ # => "https://twitter.com/Milano_JS"
52
+
53
+ agent.page.at('.ProfileHeaderCard-locationText').text
54
+ # => "Milan, Italy"
55
+ ```
56
+
57
+
58
+ ## Contributing
59
+
60
+ 1. Fork it ( https://github.com/[my-github-username]/sagrone_scraper/fork )
61
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
62
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
63
+ 4. Push to the branch (`git push origin my-new-feature`)
64
+ 5. Create a new Pull Request
data/Rakefile ADDED
@@ -0,0 +1,6 @@
1
+ require "bundler/gem_tasks"
2
+ require "rspec/core/rake_task"
3
+
4
+ RSpec::Core::RakeTask.new(:spec)
5
+
6
+ task :default => :spec
@@ -0,0 +1,46 @@
1
+ require 'mechanize'
2
+
3
module SagroneScraper
  # Holds a web page together with the URL it came from.
  #
  # Exactly one of the +:url+ or +:page+ options must be given:
  #
  #   SagroneScraper::Agent.new(url: 'http://example.com')  # fetches the page
  #   SagroneScraper::Agent.new(page: page)                 # wraps a fetched page
  #
  # When only +:page+ is given, the URL is read from +page.uri+.
  class Agent
    # Raised when the options are invalid and for any StandardError that
    # occurs while the agent is being initialized (e.g. while fetching).
    Error = Class.new(RuntimeError)

    # User-agent aliases handed to Mechanize; one is sampled per HTTP client.
    # Frozen so the shared list cannot be modified by accident.
    AGENT_ALIASES = [
      "Linux Firefox", "Linux Mozilla",
      "Mac Firefox", "Mac Mozilla", "Mac Safari",
      "Windows Chrome", "Windows IE 8", "Windows IE 9", "Windows Mozilla"
    ].freeze

    attr_reader :url, :page

    # options - Hash with exactly one of:
    #           :url  - String URL to fetch with the HTTP client.
    #           :page - an already fetched page responding to +uri+.
    #
    # Raises Agent::Error when zero or both options are given, or when
    # resolving the URL / fetching the page fails.
    def initialize(options = {})
      unless exactly_one_of(options)
        raise Error, 'Exactly one option must be provided: "url" or "page"'
      end

      @url, @page = options[:url], options[:page]

      # Only one of these assignments does any work: the other value was
      # supplied by the caller.
      @url ||= page_url
      @page ||= http_client.get(url)
    rescue Error
      # Already the public error type: re-raise as is instead of wrapping it
      # in a second Error instance.
      raise
    rescue StandardError => error
      raise Error, error.message
    end

    # Memoized HTTP client for this agent instance.
    def http_client
      @http_client ||= self.class.http_client
    end

    # Builds a new Mechanize client with a randomly sampled user-agent alias
    # and its history limit set to 0.
    def self.http_client
      Mechanize.new do |agent|
        agent.user_agent_alias = AGENT_ALIASES.sample
        agent.max_history = 0
      end
    end

    private

    # String form of the wrapped page's URI.
    def page_url
      @page.uri.to_s
    end

    # True when exactly one of options[:url] / options[:page] is truthy.
    def exactly_one_of(options)
      url_present = !!options[:url]
      page_present = !!options[:page]

      url_present ^ page_present
    end
  end
end
@@ -0,0 +1,3 @@
1
# Namespace of the gem; this file only contributes the version constant.
module SagroneScraper
  # Version string of the gem.
  VERSION = "0.0.1"
end
@@ -0,0 +1,7 @@
1
+ require "sagrone_scraper/version"
2
+
3
# Top-level namespace of the gem.
module SagroneScraper
  class << self
    # Returns the value of the VERSION constant.
    def version
      VERSION
    end
  end
end
@@ -0,0 +1,29 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+
5
+ require 'sagrone_scraper/version'
6
+
7
+ Gem::Specification.new do |spec|
8
+ spec.name = "sagrone_scraper"
9
+ spec.version = SagroneScraper::VERSION
10
+ spec.authors = ["Marius Colacioiu"]
11
+ spec.email = ["marius.colacioiu@gmail.com"]
12
+ spec.summary = %q{Sagrone Ruby Scraper.}
13
+ spec.description = %q{Simple library to scrap web pages.}
14
+ spec.homepage = ""
15
+ spec.license = "Apache License 2.0"
16
+
17
+ spec.files = `git ls-files -z`.split("\x0")
18
+ spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
19
+ spec.test_files = spec.files.grep(%r{^(spec)/})
20
+ spec.require_paths = ["lib"]
21
+
22
+ spec.add_dependency "mechanize", "~> 2.0"
23
+
24
+ spec.add_development_dependency "bundler"
25
+ spec.add_development_dependency "guard-rspec"
26
+ spec.add_development_dependency "rake"
27
+ spec.add_development_dependency "rspec"
28
+ spec.add_development_dependency "webmock"
29
+ end
@@ -0,0 +1,105 @@
1
+ require 'spec_helper'
2
+ require 'sagrone_scraper/agent'
3
+
4
+ RSpec.describe SagroneScraper::Agent do
5
+ let(:user_agent_aliases) do
6
+ [ "Linux Firefox", "Linux Mozilla",
7
+ "Mac Firefox", "Mac Mozilla", "Mac Safari",
8
+ "Windows Chrome", "Windows IE 8", "Windows IE 9", "Windows Mozilla" ]
9
+ end
10
+
11
+ describe 'AGENT_ALIASES' do
12
+ it { expect(described_class::AGENT_ALIASES).to eq(user_agent_aliases) }
13
+ end
14
+
15
+ describe '.http_client' do
16
+ subject { described_class.http_client }
17
+
18
+ it { should be_a(Mechanize) }
19
+ it { should respond_to(:get) }
20
+ it { expect(subject.user_agent).to match(/Mozilla\/5\.0/) }
21
+ end
22
+
23
+ describe '#initialize' do
24
+ describe 'should require exactly one of `url` or `page` option' do
25
+ before do
26
+ stub_request_for('http://example.com', 'www.example.com')
27
+ end
28
+
29
+ it 'when options is empty' do
30
+ expect { described_class.new }.to raise_error(SagroneScraper::Agent::Error,
31
+ /Exactly one option must be provided: "url" or "page"/)
32
+ end
33
+
34
+ it 'when both options are present' do
35
+ page = Mechanize.new.get('http://example.com')
36
+
37
+ expect {
38
+ described_class.new(url: 'http://example.com', page: page)
39
+ }.to raise_error(SagroneScraper::Agent::Error,
40
+ /Exactly one option must be provided: "url" or "page"/)
41
+ end
42
+ end
43
+
44
+ describe 'with page option' do
45
+ before do
46
+ stub_request_for('http://example.com', 'www.example.com')
47
+ end
48
+
49
+ let(:page) { Mechanize.new.get('http://example.com') }
50
+ let(:agent) { described_class.new(page: page) }
51
+
52
+ it { expect { agent }.to_not raise_error }
53
+ it { expect(agent.page).to be }
54
+ it { expect(agent.url).to eq 'http://example.com/' }
55
+ end
56
+
57
+ describe 'with invalid URL' do
58
+ let(:agent) { described_class.new(url: @invalid_url) }
59
+
60
+ it 'should require URL is absolute' do
61
+ @invalid_url = 'not-a-url'
62
+
63
+ expect { agent }.to raise_error(SagroneScraper::Agent::Error,
64
+ /absolute URL needed \(not not-a-url\)/)
65
+ end
66
+
67
+ it 'should require absolute path' do
68
+ @invalid_url = 'http://'
69
+
70
+ webmock_allow do
71
+ expect { agent }.to raise_error(SagroneScraper::Agent::Error,
72
+ /bad URI\(absolute but no path\)/)
73
+ end
74
+ end
75
+
76
it 'should require valid URL' do
  @invalid_url = 'http://example'

  webmock_allow do
    # The resolver's wording is platform specific (macOS reports
    # "nodename nor servname provided, or not known", glibc reports
    # "Name or service not known"), so match only the common prefix.
    expect { agent }.to raise_error(SagroneScraper::Agent::Error,
                                    /getaddrinfo/)
  end
end
84
+ end
85
+
86
+ describe 'with valid URL' do
87
+ before do
88
+ stub_request_for('http://example.com', 'www.example.com')
89
+ end
90
+
91
+ let(:agent) { described_class.new(url: 'http://example.com') }
92
+
93
+ it { expect(agent.http_client).to be_a(Mechanize) }
94
+ it { expect(agent.http_client).to equal(agent.http_client) }
95
+
96
+ it { expect { agent }.to_not raise_error }
97
+ it { expect(agent.url).to eq('http://example.com') }
98
+
99
+ it { expect(agent.page).to be_a(Mechanize::Page) }
100
+ it { expect(agent.page).to equal(agent.page) }
101
+ it { expect(agent.page).to respond_to(:at, :body, :title) }
102
+ it { expect(agent.page).to respond_to(:links, :labels, :images, :image_urls, :forms) }
103
+ end
104
+ end
105
+ end
@@ -0,0 +1,8 @@
1
+ require 'spec_helper'
2
+ require 'sagrone_scraper'
3
+
4
+ RSpec.describe SagroneScraper do
5
+ describe '.version' do
6
+ it { expect(SagroneScraper.version).to be_a(String) }
7
+ end
8
+ end
@@ -0,0 +1,24 @@
1
+ require 'stub_helper'
2
+
3
+ RSpec.configure do |config|
4
+ config.include(StubHelper)
5
+
6
+ config.expect_with :rspec do |expectations|
7
+ expectations.include_chain_clauses_in_custom_matcher_descriptions = true
8
+ end
9
+
10
+ config.mock_with :rspec do |mocks|
11
+ mocks.verify_partial_doubles = true
12
+ end
13
+
14
+ config.filter_run :focus
15
+ config.run_all_when_everything_filtered = true
16
+ config.disable_monkey_patching!
17
+ config.warnings = true
18
+
19
+ if config.files_to_run.one?
20
+ config.default_formatter = 'doc'
21
+ end
22
+
23
+ config.order = :random
24
+ end
@@ -0,0 +1,22 @@
1
+ require 'webmock/rspec'
2
+
3
# Spec helpers for stubbing HTTP requests with canned response files.
module StubHelper
  # Stubs GET requests to +url+ with a 200 text/html response whose body is
  # the content of the response file named +file_name+.
  def stub_request_for(url, file_name)
    stub_request(:get, url)
      .to_return({
        body: get_response_file(file_name),
        headers: { 'content-type' => 'text/html' },
        status: 200
      })
  end

  # Runs the given block with real network connections allowed.
  #
  # Connections are disabled again in +ensure+, so a block that raises does
  # not leave them enabled for the examples that run afterwards. Returns the
  # block's value.
  def webmock_allow(&block)
    WebMock.allow_net_connect!
    block.call
  ensure
    WebMock.disable_net_connect!
  end

  # Reads the response file +name+ from the test_responses directory next to
  # this helper. The path is resolved relative to this file rather than the
  # current working directory, so the specs can be run from any directory.
  def get_response_file(name)
    responses_dir = File.expand_path('../test_responses', __FILE__)
    File.read(File.join(responses_dir, name.to_s))
  end
end
@@ -0,0 +1,51 @@
1
+ <!DOCTYPE html>
2
+ <!-- saved from url=(0019)http://example.com/ -->
3
+ <html><head><meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
4
+ <title>Example Domain</title>
5
+
6
+ <meta charset="utf-8">
7
+
8
+ <meta name="viewport" content="width=device-width, initial-scale=1">
9
+ <style type="text/css">
10
+ body {
11
+ background-color: #f0f0f2;
12
+ margin: 0;
13
+ padding: 0;
14
+ font-family: "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;
15
+
16
+ }
17
+ div {
18
+ width: 600px;
19
+ margin: 5em auto;
20
+ padding: 50px;
21
+ background-color: #fff;
22
+ border-radius: 1em;
23
+ }
24
+ a:link, a:visited {
25
+ color: #38488f;
26
+ text-decoration: none;
27
+ }
28
+ @media (max-width: 700px) {
29
+ body {
30
+ background-color: #fff;
31
+ }
32
+ div {
33
+ width: auto;
34
+ margin: 0 auto;
35
+ border-radius: 0;
36
+ padding: 1em;
37
+ }
38
+ }
39
+ </style>
40
+ </head>
41
+
42
+ <body>
43
+ <div>
44
+ <h1>Example Domain</h1>
45
+ <p>This domain is established to be used for illustrative examples in documents. You may use this
46
+ domain in examples without prior coordination or asking for permission.</p>
47
+ <p><a href="http://www.iana.org/domains/example">More information...</a></p>
48
+ </div>
49
+
50
+
51
+ </body></html>
metadata ADDED
@@ -0,0 +1,152 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: sagrone_scraper
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Marius Colacioiu
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2015-03-06 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: mechanize
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '2.0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '2.0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: bundler
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: guard-rspec
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: rake
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: rspec
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: webmock
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ">="
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
97
+ description: Simple library to scrap web pages.
98
+ email:
99
+ - marius.colacioiu@gmail.com
100
+ executables: []
101
+ extensions: []
102
+ extra_rdoc_files: []
103
+ files:
104
+ - ".editorconfig"
105
+ - ".gitignore"
106
+ - ".rspec"
107
+ - ".travis.yml"
108
+ - CHANGELOG.md
109
+ - Gemfile
110
+ - Guardfile
111
+ - LICENSE
112
+ - README.md
113
+ - Rakefile
114
+ - lib/sagrone_scraper.rb
115
+ - lib/sagrone_scraper/agent.rb
116
+ - lib/sagrone_scraper/version.rb
117
+ - sagrone_scraper.gemspec
118
+ - spec/sagrone_scraper/agent_spec.rb
119
+ - spec/sagrone_scraper_spec.rb
120
+ - spec/spec_helper.rb
121
+ - spec/stub_helper.rb
122
+ - spec/test_responses/www.example.com
123
+ homepage: ''
124
+ licenses:
125
+ - Apache License 2.0
126
+ metadata: {}
127
+ post_install_message:
128
+ rdoc_options: []
129
+ require_paths:
130
+ - lib
131
+ required_ruby_version: !ruby/object:Gem::Requirement
132
+ requirements:
133
+ - - ">="
134
+ - !ruby/object:Gem::Version
135
+ version: '0'
136
+ required_rubygems_version: !ruby/object:Gem::Requirement
137
+ requirements:
138
+ - - ">="
139
+ - !ruby/object:Gem::Version
140
+ version: '0'
141
+ requirements: []
142
+ rubyforge_project:
143
+ rubygems_version: 2.2.2
144
+ signing_key:
145
+ specification_version: 4
146
+ summary: Sagrone Ruby Scraper.
147
+ test_files:
148
+ - spec/sagrone_scraper/agent_spec.rb
149
+ - spec/sagrone_scraper_spec.rb
150
+ - spec/spec_helper.rb
151
+ - spec/stub_helper.rb
152
+ - spec/test_responses/www.example.com