scraping 0.1.0

checksums.yaml.gz ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA1:
+   metadata.gz: b9aff8bf11dbde49763faa84c830518cb308daab
+   data.tar.gz: ba701a4490c085c2278ff93efbdeb244a9b74b12
+ SHA512:
+   metadata.gz: b6bc072d8df959e32b8cd5230db684ebffdb08d23ce3a2633a683042eed6f92a5f0c8e61b4c35ecd570975461d417a93f6941b1688d328eafb76cf958f247627
+   data.tar.gz: 0149081ebc7cd6073e2a178f10c1904e46a3e3cab8e2fa9f35fc82387b1a7938d4f48428a9eb58d1adc835680c5dc0248b76bd562fb7b1e8288cc24efca46154
data/.gitignore ADDED
@@ -0,0 +1,9 @@
+ /.bundle/
+ /.yardoc
+ /Gemfile.lock
+ /_yardoc/
+ /coverage/
+ /doc/
+ /pkg/
+ /spec/reports/
+ /tmp/
data/.travis.yml ADDED
@@ -0,0 +1,5 @@
+ sudo: false
+ language: ruby
+ rvm:
+   - 2.3.0
+ before_install: gem install bundler -v 1.12.3
data/Gemfile ADDED
@@ -0,0 +1,4 @@
+ source 'https://rubygems.org'
+
+ # Specify your gem's dependencies in scraping.gemspec
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
+ The MIT License (MIT)
+
+ Copyright (c) 2016 Ray Zane
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,114 @@
+ # Scraping
+
+ A really simple HTML scraping DSL.
+
+ ## Installation
+
+ Add this line to your application's Gemfile:
+
+ ```ruby
+ gem 'scraping'
+ ```
+
+ And then execute:
+
+     $ bundle
+
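+ Or, assuming the gem is published to rubygems.org, install it yourself with:
+
+     $ gem install scraping
+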
+ ## Usage
+
+ #### A simple example
+
+ ```ruby
+ class Person
+   include Scraping
+   element :name, 'h1'
+ end
+
+ person = Person.scrape('<h1>Millard Fillmore</h1>')
+ person.name #=> 'Millard Fillmore'
+ ```
+
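+ `.scrape` just takes a string of HTML, so you can hand it markup from anywhere. For example, a page fetched with Ruby's standard library (the URL below is only a placeholder):
+
+ ```ruby
+ require 'net/http'
+
+ # Fetch the page body as a plain string, then scrape it
+ html = Net::HTTP.get(URI('https://example.com/millard-fillmore'))
+ person = Person.scrape(html)
+ person.name
+ ```
+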
+ #### More complex data structures
+
+ You can also scrape arrays, objects, and arrays of objects. `elements` and `elements_of` can be deeply nested.
+
+ ```ruby
+ class YouCan
+   include Scraping
+   elements :scrape, '.scrape'
+
+   elements :also_scrape, '.also-scrape li' do
+     element :name, 'a'
+     element :link, 'a/@href'
+     elements :numbers, 'span'
+   end
+
+   elements_of :nested_scrape do
+     element :data, '.data'
+   end
+ end
+
+ you_can = YouCan.scrape(<<-EOF)
+   <p class="scrape">
+     <span>Arrays</span>
+     <span>Too</span>
+   </p>
+
+   <ul class="also-scrape">
+     <li>
+       <a href="example.com">Meek Mill</a>
+       <span>1</span>
+       <span>2</span>
+     </li>
+     <li><a href="test.com">Drake</a></li>
+   </ul>
+
+   <p class="data">Beef</p>
+ EOF
+
+ you_can.scrape #=> ['Arrays', 'Too']
+
+ you_can.also_scrape.first.name #=> 'Meek Mill'
+ you_can.also_scrape.first.link #=> 'example.com'
+ you_can.also_scrape.first.numbers #=> ['1', '2']
+
+ you_can.nested_scrape.data #=> 'Beef'
+ ```
+
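+ Selectors are handed straight to Nokogiri, so both CSS and XPath work; the `a/@href` rule above is an XPath attribute lookup. A small sketch (the class and selector here are made up for illustration):
+
+ ```ruby
+ class Meta
+   include Scraping
+   # An XPath query works the same way as a CSS selector
+   element :description, '//meta[@name="description"]/@content'
+ end
+
+ Meta.scrape('<meta name="description" content="Presidents">').description #=> 'Presidents'
+ ```
+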
+ #### Customizing extraction
+
+ A block given to `#element` lets you customize the value extracted from the matched node.
+
+ Passing `as: :something` instead calls a method named `#extract_something` on your class.
+
+ ```ruby
+ require 'date'
+
+ class Advanced
+   include Scraping
+
+   element :first_name, '.name' do |node|
+     node.text.split(' ').first
+   end
+
+   element :birthday, '.birthday', as: :date
+
+   private
+
+   def extract_date(node)
+     Date.parse(node.text)
+   end
+ end
+
+ advanced = Advanced.scrape(<<-EOF)
+   <h1 class="name">Millard Fillmore</h1>
+   <h2 class="birthday">7-1-1800</h2>
+ EOF
+
+ advanced.first_name #=> 'Millard'
+ advanced.birthday #=> #<Date: 1800-01-07>
+ ```
+
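+ If a selector doesn't match anything, the attribute is simply left as `nil` rather than raising an error. A quick sketch (the class here is only for illustration):
+
+ ```ruby
+ class Sparse
+   include Scraping
+   element :nickname, '.nickname'
+ end
+
+ # No '.nickname' node in the document, so the attribute stays nil
+ Sparse.scrape('<h1>Millard Fillmore</h1>').nickname #=> nil
+ ```
+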
+ ## Contributing
+
+ Bug reports and pull requests are welcome on GitHub at https://github.com/rzane/scraping.
+
+ ## License
+
+ The gem is available as open source under the terms of the [MIT License](http://opensource.org/licenses/MIT).
data/Rakefile ADDED
@@ -0,0 +1,10 @@
+ require "bundler/gem_tasks"
+ require "rake/testtask"
+
+ Rake::TestTask.new(:test) do |t|
+   t.libs << "test"
+   t.libs << "lib"
+   t.test_files = FileList['test/**/*_test.rb']
+ end
+
+ task :default => :test
data/lib/scraping.rb ADDED
@@ -0,0 +1,55 @@
+ require 'nokogiri'
+ require 'scraping/version'
+ require 'scraping/dsl'
+ require 'scraping/rules/element'
+ require 'scraping/rules/elements_of'
+ require 'scraping/rules/elements'
+
+ module Scraping
+   def self.included(base)
+     base.extend ClassMethods
+
+     base.class_eval do
+       attr_reader :page
+     end
+   end
+
+   def initialize(page)
+     @page = page
+   end
+
+   def scrape
+     self.class.rules.each do |name, rule|
+       public_send("#{name}=", rule.call(self, page))
+     end
+   end
+
+   module ClassMethods
+     include DSL
+
+     # Make the rules inheritable, but prevent mutation
+     # of the original hash
+     def inherited(subclass)
+       subclass.instance_variable_set(:@rules, rules.clone)
+     end
+
+     def element(name, *)
+       attr_accessor name
+       super
+     end
+
+     def elements_of(name)
+       attr_accessor name
+       super
+     end
+
+     def elements(name, *)
+       attr_accessor name
+       super
+     end
+
+     def scrape(html)
+       new(Nokogiri::HTML(html)).tap(&:scrape)
+     end
+   end
+ end
data/lib/scraping/dsl.rb ADDED
@@ -0,0 +1,19 @@
+ module Scraping
+   module DSL
+     def rules
+       @rules ||= {}
+     end
+
+     def element(name, selector, options = {}, &block)
+       rules[name] = Rules::Element.new(name, selector, options, &block)
+     end
+
+     def elements_of(name, &block)
+       rules[name] = Rules::ElementsOf.new(name).evaluate(&block)
+     end
+
+     def elements(name, selector, options = {}, &block)
+       rules[name] = Rules::Elements.new(name, selector, options).evaluate(&block)
+     end
+   end
+ end
data/lib/scraping/rules/element.rb ADDED
@@ -0,0 +1,26 @@
+ module Scraping
+   module Rules
+     class Element
+       attr_reader :name, :selector, :options, :extract
+
+       def initialize(name, selector, options = {}, &extract)
+         @name = name
+         @selector = selector
+         @options = options
+         @extract = extract if block_given?
+       end
+
+       def call(scraper, node)
+         item = node.at(selector)
+
+         if item && options[:as]
+           scraper.send("extract_#{options[:as]}", item)
+         elsif item && extract
+           scraper.instance_exec(item, &extract)
+         elsif item
+           item.text
+         end
+       end
+     end
+   end
+ end
data/lib/scraping/rules/elements.rb ADDED
@@ -0,0 +1,29 @@
+ module Scraping
+   module Rules
+     class Elements
+       attr_reader :name, :selector, :rule, :options
+
+       def initialize(name, selector, options = {})
+         @name = name
+         @selector = selector
+         @options = options
+       end
+
+       def evaluate(&block)
+         if block_given?
+           @rule = ElementsOf.new(name).evaluate(&block)
+         else
+           @rule = Element.new(name, '.', options)
+         end
+
+         self
+       end
+
+       def call(scraper, node)
+         node.search(selector).map do |item|
+           rule.call(scraper, item)
+         end
+       end
+     end
+   end
+ end
data/lib/scraping/rules/elements_of.rb ADDED
@@ -0,0 +1,26 @@
+ require 'ostruct'
+
+ module Scraping
+   module Rules
+     class ElementsOf
+       include DSL
+       attr_reader :name
+
+       def initialize(name)
+         @name = name
+       end
+
+       def evaluate(&block)
+         instance_eval(&block)
+         self
+       end
+
+       def call(scraper, node)
+         rules.inject(OpenStruct.new) do |obj, (name, rule)|
+           obj[name] = rule.call(scraper, node)
+           obj
+         end
+       end
+     end
+   end
+ end
data/lib/scraping/version.rb ADDED
@@ -0,0 +1,3 @@
+ module Scraping
+   VERSION = "0.1.0"
+ end
data/scraping.gemspec ADDED
@@ -0,0 +1,27 @@
+ # coding: utf-8
+ lib = File.expand_path('../lib', __FILE__)
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+ require 'scraping/version'
+
+ Gem::Specification.new do |spec|
+   spec.name          = "scraping"
+   spec.version       = Scraping::VERSION
+   spec.authors       = ["Ray Zane"]
+   spec.email         = ["ray@promptworks.com"]
+
+   spec.summary       = %q{A DSL for scraping HTML into objects}
+   spec.description   = %q{A DSL for scraping HTML into objects}
+   spec.homepage      = "https://github.com/rzane/scraping"
+   spec.license       = "MIT"
+
+   spec.files         = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
+   spec.bindir        = "exe"
+   spec.executables   = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
+   spec.require_paths = ["lib"]
+
+   spec.add_dependency 'nokogiri'
+
+   spec.add_development_dependency "bundler", "~> 1.12"
+   spec.add_development_dependency "rake", "~> 10.0"
+   spec.add_development_dependency "minitest", "~> 5.0"
+ end
metadata ADDED
@@ -0,0 +1,113 @@
+ --- !ruby/object:Gem::Specification
+ name: scraping
+ version: !ruby/object:Gem::Version
+   version: 0.1.0
+ platform: ruby
+ authors:
+ - Ray Zane
+ autorequire:
+ bindir: exe
+ cert_chain: []
+ date: 2016-05-25 00:00:00.000000000 Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: nokogiri
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
+ - !ruby/object:Gem::Dependency
+   name: bundler
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '1.12'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '1.12'
+ - !ruby/object:Gem::Dependency
+   name: rake
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '10.0'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '10.0'
+ - !ruby/object:Gem::Dependency
+   name: minitest
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '5.0'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '5.0'
+ description: A DSL for scraping HTML into objects
+ email:
+ - ray@promptworks.com
+ executables: []
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - ".gitignore"
+ - ".travis.yml"
+ - Gemfile
+ - LICENSE.txt
+ - README.md
+ - Rakefile
+ - lib/scraping.rb
+ - lib/scraping/dsl.rb
+ - lib/scraping/rules/element.rb
+ - lib/scraping/rules/elements.rb
+ - lib/scraping/rules/elements_of.rb
+ - lib/scraping/version.rb
+ - scraping.gemspec
+ homepage: https://github.com/rzane/scraping
+ licenses:
+ - MIT
+ metadata: {}
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubyforge_project:
+ rubygems_version: 2.4.8
+ signing_key:
+ specification_version: 4
+ summary: A DSL for scraping HTML into objects
+ test_files: []