RubyGems - sagrone_scraper - Versions diffs - 0.0.1 - Mend

sagrone_scraper 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (21) hide show

checksums.yaml +7 -0
data/.editorconfig +20 -0
data/.gitignore +9 -0
data/.rspec +2 -0
data/.travis.yml +9 -0
data/CHANGELOG.md +5 -0
data/Gemfile +3 -0
data/Guardfile +15 -0
data/LICENSE +13 -0
data/README.md +64 -0
data/Rakefile +6 -0
data/lib/sagrone_scraper/agent.rb +46 -0
data/lib/sagrone_scraper/version.rb +3 -0
data/lib/sagrone_scraper.rb +7 -0
data/sagrone_scraper.gemspec +29 -0
data/spec/sagrone_scraper/agent_spec.rb +105 -0
data/spec/sagrone_scraper_spec.rb +8 -0
data/spec/spec_helper.rb +24 -0
data/spec/stub_helper.rb +22 -0
data/spec/test_responses/www.example.com +51 -0
metadata +152 -0

checksums.yaml ADDED Viewed

@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: d067420377ca0e271b6ba7f8c00f5f6ae2198b85
+  data.tar.gz: cc93b626d827b17e7f16fa91b7fd00b13936c318
+SHA512:
+  metadata.gz: 804b9c719e81d87b762f1cea45c3e1919d459d7520270ac1176907e3cb14efef3f992f24f2ed71db3baa85cae7e0fb3b4c5f18394785da396df772a6eeb59755
+  data.tar.gz: a9b2524b7029896731942e13483e736b45115739d8cc22a7176e9afc477f6b3d460fbe8b9b88c577ae706a976ad10c30d49deba45cdfb9d65206085dd4459f3d

data/.editorconfig ADDED Viewed

@@ -0,0 +1,20 @@
+# EditorConfig helps developers define and maintain consistent
+# coding styles between different editors and IDEs
+# editorconfig.org
+root = true
+[*]
+# Change these settings to your own preference
+indent_style = space
+indent_size = 2
+# We recommend you to keep these unchanged
+end_of_line = lf
+charset = utf-8
+trim_trailing_whitespace = true
+insert_final_newline = true
+[*.md]
+trim_trailing_whitespace = false

data/.gitignore ADDED Viewed

@@ -0,0 +1,9 @@
+/bin/
+/.bundle/
+/vendor/bundle
+/Gemfile.lock
+/coverage/
+/doc/
+/pkg/
+/spec/reports/
+/tmp/

data/.rspec ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ --color
2	+ --require spec_helper

data/.travis.yml ADDED Viewed

@@ -0,0 +1,9 @@
+language: ruby
+rvm:
+  - 1.9.3
+  - 2.1.2
+script:
+  - "bundle exec rspec --color --format documentation"
+notifications:
+  recipients:
+    - marius.colacioiu@gmail.com

data/CHANGELOG.md ADDED Viewed

@@ -0,0 +1,5 @@
+### HEAD
+### 0.0.1
+- add `SagroneScraper::Agent`

data/Gemfile ADDED Viewed

@@ -0,0 +1,3 @@
+source 'https://rubygems.org'
+gemspec

data/Guardfile ADDED Viewed

@@ -0,0 +1,15 @@
+clearing :on
+guard :rspec, cmd: "bundle exec rspec" do
+  require "guard/rspec/dsl"
+  dsl = Guard::RSpec::Dsl.new(self)
+  # RSpec files
+  rspec = dsl.rspec
+  watch(rspec.spec_files)
+  watch(%r{^spec/(.+)_helper\.rb$}) { "spec" }
+  watch(%r{^spec/test_responses/(.+)$}) { "spec" }
+  # Library files
+  watch(%r{^lib/(.+)\.rb$}) { |m| "spec/#{m[1]}_spec.rb" }
+end

data/LICENSE ADDED Viewed

@@ -0,0 +1,13 @@
+Copyright 2015 Marius Colacioiu
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+  http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.

data/README.md ADDED Viewed

@@ -0,0 +1,64 @@
+# SagroneScraper
+Simple library to scrape web pages. Below you will find information on [how to use it](#usage).
+## Installation
+Add this line to your application's Gemfile:
+    gem 'sagrone_scraper'
+And then execute:
+    $ bundle
+Or install it yourself as:
+    $ gem install sagrone_scraper
+## Usage
+#### `SagroneScraper::Agent`
+The agent is responsible for scraping a web page from a URL.
+Here is how you can create an `agent`:
+1. one way is to pass it a `url` option
+    ```ruby
+    require 'sagrone_scraper/agent'
+    agent = SagroneScraper::Agent.new(url: 'https://twitter.com/Milano_JS')
+    agent.page
+    # => Mechanize::Page
+    agent.page.at('.ProfileHeaderCard-bio').text
+    # => "Javascript User Group Milano #milanojs"
+    ```
+2. another way is to pass a `page` option (`Mechanize::Page`)
+    ```ruby
+    require 'sagrone_scraper/agent'
+    mechanize_agent = Mechanize.new { |agent| agent.user_agent_alias = 'Linux Firefox' }
+    page = mechanize_agent.get('https://twitter.com/Milano_JS')
+    # => Mechanize::Page
+    agent = SagroneScraper::Agent.new(page: page)
+    agent.url
+    # => "https://twitter.com/Milano_JS"
+    agent.page.at('.ProfileHeaderCard-locationText').text
+    # => "Milan, Italy"
+    ```
+## Contributing
+1. Fork it ( https://github.com/[my-github-username]/sagrone_scraper/fork )
+2. Create your feature branch (`git checkout -b my-new-feature`)
+3. Commit your changes (`git commit -am 'Add some feature'`)
+4. Push to the branch (`git push origin my-new-feature`)
+5. Create a new Pull Request

data/Rakefile ADDED Viewed

@@ -0,0 +1,6 @@
+require "bundler/gem_tasks"
+require "rspec/core/rake_task"
+RSpec::Core::RakeTask.new(:spec)
+task :default => :spec

data/lib/sagrone_scraper/agent.rb ADDED Viewed

@@ -0,0 +1,46 @@
+require 'mechanize'
+module SagroneScraper
+  class Agent
+    Error = Class.new(RuntimeError)
+    AGENT_ALIASES = ["Linux Firefox", "Linux Mozilla", "Mac Firefox", "Mac Mozilla", "Mac Safari", "Windows Chrome", "Windows IE 8", "Windows IE 9", "Windows Mozilla"]
+    attr_reader :url, :page
+    def initialize(options = {})
+      raise Error.new('Exactly one option must be provided: "url" or "page"') unless exactly_one_of(options)
+      @url, @page = options[:url], options[:page]
+      @url ||= page_url
+      @page ||= http_client.get(url)
+    rescue StandardError => error
+      raise Error.new(error.message)
+    end
+    def http_client
+      @http_client ||= self.class.http_client
+    end
+    def self.http_client
+      Mechanize.new do |agent|
+        agent.user_agent_alias = AGENT_ALIASES.sample
+        agent.max_history = 0
+      end
+    end
+    private
+    def page_url
+      @page.uri.to_s
+    end
+    def exactly_one_of(options)
+      url_present = !!options[:url]
+      page_present = !!options[:page]
+      (url_present && !page_present) || (!url_present && page_present)
+    end
+  end
+end

data/lib/sagrone_scraper/version.rb ADDED Viewed

@@ -0,0 +1,3 @@
+module SagroneScraper
+  VERSION = "0.0.1"
+end

data/lib/sagrone_scraper.rb ADDED Viewed

@@ -0,0 +1,7 @@
+require "sagrone_scraper/version"
+module SagroneScraper
+  def self.version
+    VERSION
+  end
+end

data/sagrone_scraper.gemspec ADDED Viewed

@@ -0,0 +1,29 @@
+# coding: utf-8
+lib = File.expand_path('../lib', __FILE__)
+$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+require 'sagrone_scraper/version'
+Gem::Specification.new do |spec|
+  spec.name          = "sagrone_scraper"
+  spec.version       = SagroneScraper::VERSION
+  spec.authors       = ["Marius Colacioiu"]
+  spec.email         = ["marius.colacioiu@gmail.com"]
+  spec.summary       = %q{Sagrone Ruby Scraper.}
+  spec.description   = %q{Simple library to scrap web pages.}
+  spec.homepage      = ""
+  spec.license       = "Apache License 2.0"
+  spec.files         = `git ls-files -z`.split("\x0")
+  spec.executables   = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
+  spec.test_files    = spec.files.grep(%r{^(spec)/})
+  spec.require_paths = ["lib"]
+  spec.add_dependency "mechanize", "~> 2.0"
+  spec.add_development_dependency "bundler"
+  spec.add_development_dependency "guard-rspec"
+  spec.add_development_dependency "rake"
+  spec.add_development_dependency "rspec"
+  spec.add_development_dependency "webmock"
+end

data/spec/sagrone_scraper/agent_spec.rb ADDED Viewed

@@ -0,0 +1,105 @@
+require 'spec_helper'
+require 'sagrone_scraper/agent'
+# Specs for SagroneScraper::Agent: the alias list, the client factory and
+# the option handling of #initialize.
+RSpec.describe SagroneScraper::Agent do
+  let(:user_agent_aliases) do
+    [ "Linux Firefox", "Linux Mozilla",
+      "Mac Firefox", "Mac Mozilla", "Mac Safari",
+      "Windows Chrome", "Windows IE 8", "Windows IE 9", "Windows Mozilla" ]
+  end
+  describe 'AGENT_ALIASES' do
+    it { expect(described_class::AGENT_ALIASES).to eq(user_agent_aliases) }
+  end
+  describe '.http_client' do
+    subject { described_class.http_client }
+    it { is_expected.to be_a(Mechanize) }
+    it { is_expected.to respond_to(:get) }
+    it { expect(subject.user_agent).to match(/Mozilla\/5\.0/) }
+  end
+  describe '#initialize' do
+    describe 'should require exactly one of `url` or `page` option' do
+      before do
+        stub_request_for('http://example.com', 'www.example.com')
+      end
+      it 'when options is empty' do
+        expect { described_class.new }.to raise_error(SagroneScraper::Agent::Error,
+                                                      /Exactly one option must be provided: "url" or "page"/)
+      end
+      it 'when both options are present' do
+        page = Mechanize.new.get('http://example.com')
+        expect {
+          described_class.new(url: 'http://example.com', page: page)
+        }.to raise_error(SagroneScraper::Agent::Error,
+                          /Exactly one option must be provided: "url" or "page"/)
+      end
+    end
+    describe 'with page option' do
+      before do
+        stub_request_for('http://example.com', 'www.example.com')
+      end
+      let(:page) { Mechanize.new.get('http://example.com') }
+      let(:agent) { described_class.new(page: page) }
+      it { expect { agent }.to_not raise_error }
+      it { expect(agent.page).to be }
+      it { expect(agent.url).to eq 'http://example.com/' }
+    end
+    describe 'with invalid URL' do
+      # `agent` is lazy, so each example sets @invalid_url before first use.
+      let(:agent) { described_class.new(url: @invalid_url) }
+      it 'should require URL is absolute' do
+        @invalid_url = 'not-a-url'
+        expect { agent }.to raise_error(SagroneScraper::Agent::Error,
+                                        /absolute URL needed \(not not-a-url\)/)
+      end
+      it 'should require absolute path' do
+        @invalid_url = 'http://'
+        webmock_allow do
+          expect { agent }.to raise_error(SagroneScraper::Agent::Error,
+                                          /bad URI\(absolute but no path\)/)
+        end
+      end
+      it 'should require valid URL' do
+        @invalid_url = 'http://example'
+        webmock_allow do
+          # The text after "getaddrinfo:" differs between platforms, so
+          # only the resolver name is matched.
+          expect { agent }.to raise_error(SagroneScraper::Agent::Error,
+                                          /getaddrinfo/)
+        end
+      end
+    end
+    describe 'with valid URL' do
+      before do
+        stub_request_for('http://example.com', 'www.example.com')
+      end
+      let(:agent) { described_class.new(url: 'http://example.com') }
+      it { expect(agent.http_client).to be_a(Mechanize) }
+      it { expect(agent.http_client).to equal(agent.http_client) }
+      it { expect { agent }.to_not raise_error }
+      it { expect(agent.url).to eq('http://example.com') }
+      it { expect(agent.page).to be_a(Mechanize::Page) }
+      it { expect(agent.page).to equal(agent.page) }
+      it { expect(agent.page).to respond_to(:at, :body, :title) }
+      it { expect(agent.page).to respond_to(:links, :labels, :images, :image_urls, :forms) }
+    end
+  end
+end

data/spec/sagrone_scraper_spec.rb ADDED Viewed

@@ -0,0 +1,8 @@
+require 'spec_helper'
+require 'sagrone_scraper'
+RSpec.describe SagroneScraper do
+  describe '.version' do
+    it { expect(SagroneScraper.version).to be_a(String) }
+  end
+end

data/spec/spec_helper.rb ADDED Viewed

@@ -0,0 +1,24 @@
+require 'stub_helper'
+RSpec.configure do |config|
+  config.include(StubHelper)
+  config.expect_with :rspec do |expectations|
+    expectations.include_chain_clauses_in_custom_matcher_descriptions = true
+  end
+  config.mock_with :rspec do |mocks|
+    mocks.verify_partial_doubles = true
+  end
+  config.filter_run :focus
+  config.run_all_when_everything_filtered = true
+  config.disable_monkey_patching!
+  config.warnings = true
+  if config.files_to_run.one?
+    config.default_formatter = 'doc'
+  end
+  config.order = :random
+end

data/spec/stub_helper.rb ADDED Viewed

@@ -0,0 +1,22 @@
+require 'webmock/rspec'
+module StubHelper
+  def stub_request_for(url, file_name)
+    stub_request(:get, url)
+      .to_return({
+        body: get_response_file(file_name),
+        headers: {'content-type' => 'text/html'},
+        status: 200
+      })
+  end
+  def webmock_allow(&block)
+    WebMock.allow_net_connect!
+    block.call
+    WebMock.disable_net_connect!
+  end
+  def get_response_file(name)
+    IO.read(File.join('spec/test_responses', "#{name}"))
+  end
+end

data/spec/test_responses/www.example.com ADDED Viewed

@@ -0,0 +1,51 @@
+<!DOCTYPE html>
+<!-- saved from url=(0019)http://example.com/ -->
+<html><head><meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
+    <title>Example Domain</title>
+    <meta charset="utf-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1">
+    <style type="text/css">
+    body {
+        background-color: #f0f0f2;
+        margin: 0;
+        padding: 0;
+        font-family: "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;
+    }
+    div {
+        width: 600px;
+        margin: 5em auto;
+        padding: 50px;
+        background-color: #fff;
+        border-radius: 1em;
+    }
+    a:link, a:visited {
+        color: #38488f;
+        text-decoration: none;
+    }
+    @media (max-width: 700px) {
+        body {
+            background-color: #fff;
+        }
+        div {
+            width: auto;
+            margin: 0 auto;
+            border-radius: 0;
+            padding: 1em;
+        }
+    }
+    </style>
+</head>
+<body>
+<div>
+    <h1>Example Domain</h1>
+    <p>This domain is established to be used for illustrative examples in documents. You may use this
+    domain in examples without prior coordination or asking for permission.</p>
+    <p><a href="http://www.iana.org/domains/example">More information...</a></p>
+</div>
+</body></html>

metadata ADDED Viewed

@@ -0,0 +1,152 @@
+--- !ruby/object:Gem::Specification
+name: sagrone_scraper
+version: !ruby/object:Gem::Version
+  version: 0.0.1
+platform: ruby
+authors:
+- Marius Colacioiu
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2015-03-06 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: mechanize
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '2.0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '2.0'
+- !ruby/object:Gem::Dependency
+  name: bundler
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: guard-rspec
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: rake
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: rspec
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: webmock
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+description: Simple library to scrap web pages.
+email:
+- marius.colacioiu@gmail.com
+executables: []
+extensions: []
+extra_rdoc_files: []
+files:
+- ".editorconfig"
+- ".gitignore"
+- ".rspec"
+- ".travis.yml"
+- CHANGELOG.md
+- Gemfile
+- Guardfile
+- LICENSE
+- README.md
+- Rakefile
+- lib/sagrone_scraper.rb
+- lib/sagrone_scraper/agent.rb
+- lib/sagrone_scraper/version.rb
+- sagrone_scraper.gemspec
+- spec/sagrone_scraper/agent_spec.rb
+- spec/sagrone_scraper_spec.rb
+- spec/spec_helper.rb
+- spec/stub_helper.rb
+- spec/test_responses/www.example.com
+homepage: ''
+licenses:
+- Apache License 2.0
+metadata: {}
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubyforge_project:
+rubygems_version: 2.2.2
+signing_key:
+specification_version: 4
+summary: Sagrone Ruby Scraper.
+test_files:
+- spec/sagrone_scraper/agent_spec.rb
+- spec/sagrone_scraper_spec.rb
+- spec/spec_helper.rb
+- spec/stub_helper.rb
+- spec/test_responses/www.example.com