skrape 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: a431f8709c3f19242645f7f2221392417f9a089a
4
+ data.tar.gz: 878dd97f1c89db56dbbf78db16433ab05deb5e3c
5
+ SHA512:
6
+ metadata.gz: 37f91a440aeb8e90e82e33b069888e9e25dfedbceaef6243666cc6e824bf7da169cac57d28e8724a9393766f0549860fbb85d5a7562f4e4278abbad476140e43
7
+ data.tar.gz: 543871c6fa15105834452047c1a0076a853d2d7b92487c8d59fafd30a87bee2dd3df3bfdd9069cce1cf319f764f54eeefd2bbac5dbaccb0190828140ce71725b
data/.gitignore ADDED
@@ -0,0 +1,17 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
data/Gemfile ADDED
@@ -0,0 +1,3 @@
1
# Resolve gems from the public RubyGems index.
source('https://rubygems.org')

# Pull the dependency list from the gemspec in this directory.
gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2014 Mike Williamson
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,57 @@
1
+ # Skrape
2
+
3
+ Skrape is a tiny DSL for web scraping: a cute little way of picking information off of web pages.
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ gem 'skrape'
10
+
11
+ And then execute:
12
+
13
+ $ bundle
14
+
15
+ Or install it yourself as:
16
+
17
+ $ gem install skrape
18
+
19
+ ## Usage
20
+
21
+ Skrape provides a cute DSL for extracting information from pages on the
22
+ web. You give it a url and a block and it gives you back a hash.
23
+
24
+ Let's say you have a page like this:
25
+
26
+ <html><body><h1>I am a title</h1></body></html>
27
+
28
+ And you want a hash like this:
29
+
30
+ {title: "I am a title"}
31
+
32
+ You can run Skrape like this:
33
+
34
+ results = Skrape::Page.new("http://example.com").extract do
35
+ extract_title with: 'h1'
36
+ end
37
+
38
+ Skrape will give you the element's text by default, but in cases where
39
+ you are after something else, or need to do some massaging of the data,
40
+ you can pass a proc with the `and_run:` option:
41
+
42
+ results = Skrape::Page.new(url).extract do
43
+ extract_link_href with: 'a', and_run: proc {|link| link.attr('href').value }
44
+ end
45
+
46
+ The element(s) will be passed into the block as a
47
+ Nokogiri::XML::NodeSet for you to play with. Whatever text you return
48
+ will be added to the hash of things to return.
49
+
50
+
51
+ ## Contributing
52
+
53
+ 1. Fork it ( http://github.com/sleepycat/skrape/fork )
54
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
55
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
56
+ 4. Push to the branch (`git push origin my-new-feature`)
57
+ 5. Create a new Pull Request
data/Rakefile ADDED
@@ -0,0 +1 @@
1
+ require "bundler/gem_tasks"
@@ -0,0 +1,3 @@
1
module Skrape
  # Current release of the gem; skrape.gemspec reads this for `spec.version`.
  # Frozen so the shared constant cannot be mutated in place.
  VERSION = "0.0.1".freeze
end
data/lib/skrape.rb ADDED
@@ -0,0 +1,37 @@
1
+ require "skrape/version"
2
+ require 'nokogiri'
3
+ require 'open-uri'
4
+ require 'pry'
5
+
6
module Skrape

  # Raised when the CSS selector given to an `extract_*` call matches nothing.
  class NoElementsFoundError < StandardError; end

  # Fetches a web page and collects named pieces of information from it.
  #
  # Inside the block given to #extract, any call of the form
  # `extract_<name> with: '<css selector>'` stores the matched elements' text
  # under `:<name>` in the hash that #extract returns:
  #
  #   Skrape::Page.new("http://example.com").extract do
  #     extract_title with: 'h1'
  #     extract_link_href with: 'a', and_run: proc { |link| link.attr('href').value }
  #   end
  class Page

    # Browser-like User-Agent header sent with the page request.
    USER_AGENT = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/32.0.1700.102 Chrome/32.0.1700.102 Safari/537.36"

    # Dynamic extractor names: the `extract_` prefix followed by the key name.
    EXTRACTOR_PATTERN = /\Aextract_(.+)\z/

    # Downloads and parses the page.
    #
    # The URL is opened through open-uri's URI#open rather than Kernel#open:
    # Kernel#open no longer understands URLs on Ruby 3.0+, and on older
    # versions it would also open local files or "|command" pipes. The block
    # form closes the downloaded IO once Nokogiri has parsed it.
    #
    # @param url [String, URI] address of the page (a scheme open-uri supports)
    def initialize(url)
      @extracted_info = {}
      @document = URI(url).open("User-Agent" => USER_AGENT) { |io| Nokogiri::HTML(io) }
    end

    # Evaluates the block in the context of this page so that `extract_*`
    # calls inside it are handled by #method_missing.
    #
    # @return [Hash{Symbol => Object}] everything extracted so far
    # @raise [ArgumentError] when no block is given
    def extract(&block)
      # An explicit &block replaces the block-less Proc.new, which Ruby 3.0 removed.
      raise ArgumentError, "extract requires a block" unless block
      instance_eval(&block)
      @extracted_info
    end

    # Handles `extract_<name>` calls; every other unknown method is passed to
    # `super` so that typos raise the usual NoMethodError.
    #
    # Options (a single hash):
    #   :with    - CSS selector used to find the element(s) (required)
    #   :and_run - callable that receives the matched elements; its return
    #              value is stored instead of the elements' text. A literal
    #              block is accepted as an alternative to :and_run.
    #
    # @raise [ArgumentError] when the :with selector is missing
    # @raise [NoElementsFoundError] when the selector matches nothing
    def method_missing(name, *args, &block)
      feature_name = feature_name_for(name)
      return super if feature_name.nil?

      options = args.first
      unless args.length == 1 && options.is_a?(Hash) && options[:with]
        raise ArgumentError, "#{name} expects a CSS selector, e.g. `#{name} with: 'h1'`"
      end

      elements = @document.css(options[:with])
      raise NoElementsFoundError, "the css selector for '#{feature_name}' did not return anything" if elements.empty?

      handler = options[:and_run] || block
      @extracted_info[feature_name] = handler ? handler.call(elements) : elements.text
    end

    # Keeps respond_to? consistent with the names #method_missing handles.
    def respond_to_missing?(name, include_private = false)
      !feature_name_for(name).nil? || super
    end

    private

    # Returns the result key for an `extract_<name>` method name, or nil when
    # the name is not an extractor. Only the leading prefix is stripped.
    def feature_name_for(method_name)
      match = EXTRACTOR_PATTERN.match(method_name.to_s)
      match && match[1].to_sym
    end

  end

end
data/skrape.gemspec ADDED
@@ -0,0 +1,26 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'skrape/version'
5
+
6
Gem::Specification.new do |s|
  # Identity and authorship.
  s.name          = "skrape"
  s.version       = Skrape::VERSION
  s.authors       = ["Mike Williamson"]
  s.email         = ["mike@korora.ca"]
  s.summary       = "A tiny DSL for web scraping."
  s.description   = "A cute little DSL for picking information off of web pages."
  s.homepage      = "https://github.com/sleepycat/skrape"
  s.license       = "MIT"

  # Package every file tracked by git (NUL-separated listing), then derive
  # the executables and test files from that list by path prefix.
  s.files         = `git ls-files -z`.split("\x0")
  s.executables   = s.files.grep(%r{^bin/}).map { |path| File.basename(path) }
  s.test_files    = s.files.grep(%r{^(test|spec|features)/})
  s.require_paths = ["lib"]

  # Needed only for developing and testing the gem.
  {
    "bundler" => "~> 1.5",
    "rspec"   => "~> 2.14",
    "webmock" => "~> 1.7",
    "pry"     => "~> 0.9"
  }.each do |gem_name, requirement|
    s.add_development_dependency gem_name, requirement
  end

  # Needed at runtime.
  s.add_dependency "nokogiri", "~> 1.6"
end
@@ -0,0 +1,39 @@
1
+ require 'spec_helper'
2
+
3
describe Skrape do

  describe Skrape::Page do

    let(:url) { "http://example.com" }

    # Raw HTTP response (status line, headers and body) replayed by WebMock.
    # The path is resolved relative to this spec file so the suite does not
    # depend on the working directory, and the content is read into a String
    # so no file handle is left open after each example.
    let(:example_response) do
      File.read(File.expand_path('../test_data/example_com_raw_response', __FILE__))
    end

    before(:each) do
      stub_request(:get, url).to_return(example_response)
    end

    it "returns the text of the element identified by the CSS selector" do
      results = Skrape::Page.new(url).extract do
        extract_title with: 'h1'
      end
      expect(results[:title]).to eq "Example Domain"
    end

    it "accepts a block so you can do more sophisticated things" do
      results = Skrape::Page.new(url).extract do
        extract_link_href with: 'a', and_run: proc { |link| link.attr('href').value }
      end
      expect(results[:link_href]).to eq "http://www.iana.org/domains/example"
    end

    it "raises a helpful error when the CSS selector returns nothing" do
      expect {
        Skrape::Page.new(url).extract do
          extract_nothing with: 'foo'
        end
      }.to raise_error Skrape::NoElementsFoundError
    end

  end

end
39
+
@@ -0,0 +1,13 @@
1
+ require_relative '../lib/skrape'
2
+ require 'webmock/rspec'
3
+
4
RSpec.configure do |rspec|
  # Colorize output written to STDOUT.
  rspec.color_enabled = true

  # Treat the output as a terminal so color is also used in pagers and files.
  rspec.tty = true

  # Formatter for the run; alternatives include :progress, :html, :textmate.
  rspec.formatter = :documentation
end
@@ -0,0 +1,63 @@
1
+ HTTP/1.1 200 OK
2
+ Accept-Ranges: bytes
3
+ Cache-Control: max-age=604800
4
+ Content-Type: text/html
5
+ Date: Tue, 11 Feb 2014 15:50:08 GMT
6
+ Etag: "359670651"
7
+ Expires: Tue, 18 Feb 2014 15:50:08 GMT
8
+ Last-Modified: Fri, 09 Aug 2013 23:54:35 GMT
9
+ Server: ECS (iad/19AB)
10
+ X-Cache: HIT
11
+ x-ec-custom-error: 1
12
+ Content-Length: 1270
13
+
14
+ <!doctype html>
15
+ <html>
16
+ <head>
17
+ <title>Example Domain</title>
18
+
19
+ <meta charset="utf-8" />
20
+ <meta http-equiv="Content-type" content="text/html; charset=utf-8" />
21
+ <meta name="viewport" content="width=device-width, initial-scale=1" />
22
+ <style type="text/css">
23
+ body {
24
+ background-color: #f0f0f2;
25
+ margin: 0;
26
+ padding: 0;
27
+ font-family: "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;
28
+
29
+ }
30
+ div {
31
+ width: 600px;
32
+ margin: 5em auto;
33
+ padding: 50px;
34
+ background-color: #fff;
35
+ border-radius: 1em;
36
+ }
37
+ a:link, a:visited {
38
+ color: #38488f;
39
+ text-decoration: none;
40
+ }
41
+ @media (max-width: 700px) {
42
+ body {
43
+ background-color: #fff;
44
+ }
45
+ div {
46
+ width: auto;
47
+ margin: 0 auto;
48
+ border-radius: 0;
49
+ padding: 1em;
50
+ }
51
+ }
52
+ </style>
53
+ </head>
54
+
55
+ <body>
56
+ <div>
57
+ <h1>Example Domain</h1>
58
+ <p>This domain is established to be used for illustrative examples in documents. You may use this
59
+ domain in examples without prior coordination or asking for permission.</p>
60
+ <p><a href="http://www.iana.org/domains/example">More information...</a></p>
61
+ </div>
62
+ </body>
63
+ </html>
metadata ADDED
@@ -0,0 +1,114 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: skrape
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Mike Williamson
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2014-02-12 00:00:00 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: bundler
16
+ prerelease: false
17
+ requirement: &id001 !ruby/object:Gem::Requirement
18
+ requirements:
19
+ - - ~>
20
+ - !ruby/object:Gem::Version
21
+ version: "1.5"
22
+ type: :development
23
+ version_requirements: *id001
24
+ - !ruby/object:Gem::Dependency
25
+ name: rspec
26
+ prerelease: false
27
+ requirement: &id002 !ruby/object:Gem::Requirement
28
+ requirements:
29
+ - - ~>
30
+ - !ruby/object:Gem::Version
31
+ version: "2.14"
32
+ type: :development
33
+ version_requirements: *id002
34
+ - !ruby/object:Gem::Dependency
35
+ name: webmock
36
+ prerelease: false
37
+ requirement: &id003 !ruby/object:Gem::Requirement
38
+ requirements:
39
+ - - ~>
40
+ - !ruby/object:Gem::Version
41
+ version: "1.7"
42
+ type: :development
43
+ version_requirements: *id003
44
+ - !ruby/object:Gem::Dependency
45
+ name: pry
46
+ prerelease: false
47
+ requirement: &id004 !ruby/object:Gem::Requirement
48
+ requirements:
49
+ - - ~>
50
+ - !ruby/object:Gem::Version
51
+ version: "0.9"
52
+ type: :development
53
+ version_requirements: *id004
54
+ - !ruby/object:Gem::Dependency
55
+ name: nokogiri
56
+ prerelease: false
57
+ requirement: &id005 !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ~>
60
+ - !ruby/object:Gem::Version
61
+ version: "1.6"
62
+ type: :runtime
63
+ version_requirements: *id005
64
+ description: A cute little DSL for picking information off of web pages.
65
+ email:
66
+ - mike@korora.ca
67
+ executables: []
68
+
69
+ extensions: []
70
+
71
+ extra_rdoc_files: []
72
+
73
+ files:
74
+ - .gitignore
75
+ - Gemfile
76
+ - LICENSE.txt
77
+ - README.md
78
+ - Rakefile
79
+ - lib/skrape.rb
80
+ - lib/skrape/version.rb
81
+ - skrape.gemspec
82
+ - spec/skrape_spec.rb
83
+ - spec/spec_helper.rb
84
+ - spec/test_data/example_com_raw_response
85
+ homepage: https://github.com/sleepycat/skrape
86
+ licenses:
87
+ - MIT
88
+ metadata: {}
89
+
90
+ post_install_message:
91
+ rdoc_options: []
92
+
93
+ require_paths:
94
+ - lib
95
+ required_ruby_version: !ruby/object:Gem::Requirement
96
+ requirements:
97
+ - &id006
98
+ - ">="
99
+ - !ruby/object:Gem::Version
100
+ version: "0"
101
+ required_rubygems_version: !ruby/object:Gem::Requirement
102
+ requirements:
103
+ - *id006
104
+ requirements: []
105
+
106
+ rubyforge_project:
107
+ rubygems_version: 2.2.2
108
+ signing_key:
109
+ specification_version: 4
110
+ summary: A tiny DSL for web scraping.
111
+ test_files:
112
+ - spec/skrape_spec.rb
113
+ - spec/spec_helper.rb
114
+ - spec/test_data/example_com_raw_response