skrape 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: a431f8709c3f19242645f7f2221392417f9a089a
4
+ data.tar.gz: 878dd97f1c89db56dbbf78db16433ab05deb5e3c
5
+ SHA512:
6
+ metadata.gz: 37f91a440aeb8e90e82e33b069888e9e25dfedbceaef6243666cc6e824bf7da169cac57d28e8724a9393766f0549860fbb85d5a7562f4e4278abbad476140e43
7
+ data.tar.gz: 543871c6fa15105834452047c1a0076a853d2d7b92487c8d59fafd30a87bee2dd3df3bfdd9069cce1cf319f764f54eeefd2bbac5dbaccb0190828140ce71725b
data/.gitignore ADDED
@@ -0,0 +1,17 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
data/Gemfile ADDED
@@ -0,0 +1,3 @@
1
+ source 'https://rubygems.org'
2
+
3
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2014 Mike Williamson
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,57 @@
1
+ # Skrape
2
+
3
+ TODO: Write a gem description
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ gem 'skrape'
10
+
11
+ And then execute:
12
+
13
+ $ bundle
14
+
15
+ Or install it yourself as:
16
+
17
+ $ gem install skrape
18
+
19
+ ## Usage
20
+
21
+ Skrape provides a cute DSL for extracting information from pages on the
22
+ web. You give it a url and a block and it gives you back a hash.
23
+
24
+ Let's say you have a page like this:
25
+
26
+ <html><body><h1>I am a title</h1></body></html>
27
+
28
+ And you want a hash like this:
29
+
30
+ {title: "I am a title"}
31
+
32
+ You can run Skrape like this:
33
+
34
+ results = Skrape::Page.new("http://example.com").extract do
35
+ extract_title with: 'h1'
36
+ end
37
+
38
+ Skrape will give you the element's text by default, but in cases where
39
+ you are after something else, or need to do some massaging of the data,
40
+ you can pass a proc via the `and_run:` option:
41
+
42
+ results = Skrape::Page.new(url).extract do
43
+ extract_link_href with: 'a', and_run: proc {|link| link.attr('href').value }
44
+ end
45
+
46
+ The element(s) will be passed into the proc as a
47
+ Nokogiri::XML::NodeSet for you to play with. Whatever text you return
48
+ will be added to the hash of things to return.
49
+
50
+
51
+ ## Contributing
52
+
53
+ 1. Fork it ( http://github.com/sleepycat/skrape/fork )
54
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
55
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
56
+ 4. Push to the branch (`git push origin my-new-feature`)
57
+ 5. Create a new Pull Request
data/Rakefile ADDED
@@ -0,0 +1 @@
1
+ require "bundler/gem_tasks"
@@ -0,0 +1,3 @@
1
+ module Skrape
2
+ VERSION = "0.0.1"
3
+ end
data/lib/skrape.rb ADDED
@@ -0,0 +1,37 @@
1
+ require "skrape/version"
2
+ require 'nokogiri'
3
+ require 'open-uri'
4
+ require 'pry'
5
+
6
+ module Skrape
7
+
8
+ class NoElementsFoundError < StandardError; end
9
+
10
+ class Page
11
+
12
+ def initialize url
13
+ @extracted_info = {}
14
+ agent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/32.0.1700.102 Chrome/32.0.1700.102 Safari/537.36"
15
+ @document = Nokogiri::HTML(open(url, "User-Agent" => agent))
16
+ end
17
+
18
+ def extract
19
+ block = Proc.new
20
+ instance_eval &block
21
+ @extracted_info
22
+ end
23
+
24
+ def method_missing name, args
25
+ feature_name = name.to_s.gsub('extract_', '').to_sym
26
+ element = @document.css args[:with]
27
+ raise NoElementsFoundError, "the css selector for '#{feature_name}' did not return anything" if element.empty?
28
+ if args[:and_run]
29
+ @extracted_info[feature_name] = args[:and_run].call(element)
30
+ else
31
+ @extracted_info[feature_name] = element.text
32
+ end
33
+ end
34
+
35
+ end
36
+
37
+ end
data/skrape.gemspec ADDED
@@ -0,0 +1,26 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'skrape/version'
5
+
6
+ Gem::Specification.new do |spec|
7
+ spec.name = "skrape"
8
+ spec.version = Skrape::VERSION
9
+ spec.authors = ["Mike Williamson"]
10
+ spec.email = ["mike@korora.ca"]
11
+ spec.summary = "A tiny DSL for web scraping."
12
+ spec.description = "A cute little DSL for picking information off of web pages."
13
+ spec.homepage = "https://github.com/sleepycat/skrape"
14
+ spec.license = "MIT"
15
+
16
+ spec.files = `git ls-files -z`.split("\x0")
17
+ spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
18
+ spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
19
+ spec.require_paths = ["lib"]
20
+
21
+ spec.add_development_dependency "bundler", "~> 1.5"
22
+ spec.add_development_dependency "rspec", "~> 2.14"
23
+ spec.add_development_dependency "webmock", "~> 1.7"
24
+ spec.add_development_dependency "pry", "~> 0.9"
25
+ spec.add_dependency "nokogiri", "~> 1.6"
26
+ end
@@ -0,0 +1,39 @@
1
+ require 'spec_helper'
2
+
3
+ describe Skrape do
4
+
5
+ describe Skrape::Page do
6
+
7
+ let(:url){ "http://example.com" }
8
+ let(:example_response){ File.new('spec/test_data/example_com_raw_response') }
9
+
10
+ before(:each) do
11
+ stub_request(:get, url).to_return(example_response)
12
+ end
13
+
14
+ it "returns the text of the element identifed by the CSS selector" do
15
+ results = Skrape::Page.new(url).extract do
16
+ extract_title with: 'h1'
17
+ end
18
+ expect(results[:title]).to eq "Example Domain"
19
+ end
20
+
21
+ it "accepts a block so you can do more sophisticated things" do
22
+ results = Skrape::Page.new(url).extract do
23
+ extract_link_href with: 'a', and_run: proc {|link| link.attr('href').value }
24
+ end
25
+ expect(results[:link_href]).to eq "http://www.iana.org/domains/example"
26
+ end
27
+
28
+ it "raises a helpful error when the CSS selector returns nothing" do
29
+ expect{
30
+ Skrape::Page.new(url).extract do
31
+ extract_nothing with: 'foo'
32
+ end
33
+ }.to raise_error Skrape::NoElementsFoundError
34
+ end
35
+
36
+ end
37
+
38
+ end
39
+
@@ -0,0 +1,13 @@
1
+ require_relative '../lib/skrape'
2
+ require 'webmock/rspec'
3
+
4
+ RSpec.configure do |config|
5
+ # Use color in STDOUT
6
+ config.color_enabled = true
7
+
8
+ # Use color not only in STDOUT but also in pagers and files
9
+ config.tty = true
10
+
11
+ # Use the specified formatter
12
+ config.formatter = :documentation # :progress, :html, :textmate
13
+ end
@@ -0,0 +1,63 @@
1
+ HTTP/1.1 200 OK
2
+ Accept-Ranges: bytes
3
+ Cache-Control: max-age=604800
4
+ Content-Type: text/html
5
+ Date: Tue, 11 Feb 2014 15:50:08 GMT
6
+ Etag: "359670651"
7
+ Expires: Tue, 18 Feb 2014 15:50:08 GMT
8
+ Last-Modified: Fri, 09 Aug 2013 23:54:35 GMT
9
+ Server: ECS (iad/19AB)
10
+ X-Cache: HIT
11
+ x-ec-custom-error: 1
12
+ Content-Length: 1270
13
+
14
+ <!doctype html>
15
+ <html>
16
+ <head>
17
+ <title>Example Domain</title>
18
+
19
+ <meta charset="utf-8" />
20
+ <meta http-equiv="Content-type" content="text/html; charset=utf-8" />
21
+ <meta name="viewport" content="width=device-width, initial-scale=1" />
22
+ <style type="text/css">
23
+ body {
24
+ background-color: #f0f0f2;
25
+ margin: 0;
26
+ padding: 0;
27
+ font-family: "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;
28
+
29
+ }
30
+ div {
31
+ width: 600px;
32
+ margin: 5em auto;
33
+ padding: 50px;
34
+ background-color: #fff;
35
+ border-radius: 1em;
36
+ }
37
+ a:link, a:visited {
38
+ color: #38488f;
39
+ text-decoration: none;
40
+ }
41
+ @media (max-width: 700px) {
42
+ body {
43
+ background-color: #fff;
44
+ }
45
+ div {
46
+ width: auto;
47
+ margin: 0 auto;
48
+ border-radius: 0;
49
+ padding: 1em;
50
+ }
51
+ }
52
+ </style>
53
+ </head>
54
+
55
+ <body>
56
+ <div>
57
+ <h1>Example Domain</h1>
58
+ <p>This domain is established to be used for illustrative examples in documents. You may use this
59
+ domain in examples without prior coordination or asking for permission.</p>
60
+ <p><a href="http://www.iana.org/domains/example">More information...</a></p>
61
+ </div>
62
+ </body>
63
+ </html>
metadata ADDED
@@ -0,0 +1,114 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: skrape
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Mike Williamson
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2014-02-12 00:00:00 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: bundler
16
+ prerelease: false
17
+ requirement: &id001 !ruby/object:Gem::Requirement
18
+ requirements:
19
+ - - ~>
20
+ - !ruby/object:Gem::Version
21
+ version: "1.5"
22
+ type: :development
23
+ version_requirements: *id001
24
+ - !ruby/object:Gem::Dependency
25
+ name: rspec
26
+ prerelease: false
27
+ requirement: &id002 !ruby/object:Gem::Requirement
28
+ requirements:
29
+ - - ~>
30
+ - !ruby/object:Gem::Version
31
+ version: "2.14"
32
+ type: :development
33
+ version_requirements: *id002
34
+ - !ruby/object:Gem::Dependency
35
+ name: webmock
36
+ prerelease: false
37
+ requirement: &id003 !ruby/object:Gem::Requirement
38
+ requirements:
39
+ - - ~>
40
+ - !ruby/object:Gem::Version
41
+ version: "1.7"
42
+ type: :development
43
+ version_requirements: *id003
44
+ - !ruby/object:Gem::Dependency
45
+ name: pry
46
+ prerelease: false
47
+ requirement: &id004 !ruby/object:Gem::Requirement
48
+ requirements:
49
+ - - ~>
50
+ - !ruby/object:Gem::Version
51
+ version: "0.9"
52
+ type: :development
53
+ version_requirements: *id004
54
+ - !ruby/object:Gem::Dependency
55
+ name: nokogiri
56
+ prerelease: false
57
+ requirement: &id005 !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ~>
60
+ - !ruby/object:Gem::Version
61
+ version: "1.6"
62
+ type: :runtime
63
+ version_requirements: *id005
64
+ description: A cute little DSL for picking information off of web pages.
65
+ email:
66
+ - mike@korora.ca
67
+ executables: []
68
+
69
+ extensions: []
70
+
71
+ extra_rdoc_files: []
72
+
73
+ files:
74
+ - .gitignore
75
+ - Gemfile
76
+ - LICENSE.txt
77
+ - README.md
78
+ - Rakefile
79
+ - lib/skrape.rb
80
+ - lib/skrape/version.rb
81
+ - skrape.gemspec
82
+ - spec/skrape_spec.rb
83
+ - spec/spec_helper.rb
84
+ - spec/test_data/example_com_raw_response
85
+ homepage: https://github.com/sleepycat/skrape
86
+ licenses:
87
+ - MIT
88
+ metadata: {}
89
+
90
+ post_install_message:
91
+ rdoc_options: []
92
+
93
+ require_paths:
94
+ - lib
95
+ required_ruby_version: !ruby/object:Gem::Requirement
96
+ requirements:
97
+ - &id006
98
+ - ">="
99
+ - !ruby/object:Gem::Version
100
+ version: "0"
101
+ required_rubygems_version: !ruby/object:Gem::Requirement
102
+ requirements:
103
+ - *id006
104
+ requirements: []
105
+
106
+ rubyforge_project:
107
+ rubygems_version: 2.2.2
108
+ signing_key:
109
+ specification_version: 4
110
+ summary: A tiny DSL for web scraping.
111
+ test_files:
112
+ - spec/skrape_spec.rb
113
+ - spec/spec_helper.rb
114
+ - spec/test_data/example_com_raw_response