scraping 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: b9aff8bf11dbde49763faa84c830518cb308daab
4
+ data.tar.gz: ba701a4490c085c2278ff93efbdeb244a9b74b12
5
+ SHA512:
6
+ metadata.gz: b6bc072d8df959e32b8cd5230db684ebffdb08d23ce3a2633a683042eed6f92a5f0c8e61b4c35ecd570975461d417a93f6941b1688d328eafb76cf958f247627
7
+ data.tar.gz: 0149081ebc7cd6073e2a178f10c1904e46a3e3cab8e2fa9f35fc82387b1a7938d4f48428a9eb58d1adc835680c5dc0248b76bd562fb7b1e8288cc24efca46154
@@ -0,0 +1,9 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /Gemfile.lock
4
+ /_yardoc/
5
+ /coverage/
6
+ /doc/
7
+ /pkg/
8
+ /spec/reports/
9
+ /tmp/
@@ -0,0 +1,5 @@
1
+ sudo: false
2
+ language: ruby
3
+ rvm:
4
+ - 2.3.0
5
+ before_install: gem install bundler -v 1.12.3
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'  # registry that gems are resolved from
2
+
3
+ # Specify your gem's dependencies in scraping.gemspec
4
+ gemspec  # take the dependency list from the gemspec rather than listing gems here
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2016 Ray Zane
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
@@ -0,0 +1,114 @@
1
+ # Scraping
2
+
3
+ A really simple HTML scraping DSL.
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ ```ruby
10
+ gem 'scraping'
11
+ ```
12
+
13
+ And then execute:
14
+
15
+ $ bundle
16
+
17
+ ## Usage
18
+
19
+ #### A simple example
20
+
21
+ ```ruby
22
+ class Person
23
+ include Scraping
24
+ element :name, 'h1'
25
+ end
26
+
27
+ person = Person.scrape('<h1>Millard Fillmore</h1>')
28
+ person.name #=> 'Millard Fillmore'
29
+ ```
30
+
31
+ #### More complex data structures
32
+
33
+ You can also scrape arrays, objects, and arrays of objects. `elements` and `elements_of` can be deeply nested.
34
+
35
+ ```ruby
36
+ class YouCan
37
+ include Scraping
38
+ elements :scrape, '.scrape'
39
+
40
+ elements :also_scrape, '.also-scrape li' do
41
+ element :name, 'a'
42
+ element :link, 'a/@href'
43
+ elements :numbers, 'span'
44
+ end
45
+
46
+ elements_of :nested_scrape do
47
+ element :data, '.data'
48
+ end
49
+ end
50
+
51
+ you_can = YouCan.scrape(<<-EOF)
52
+ <p class="scrape">
53
+ <span>Arrays</span>
54
+ <span>Too</span>
55
+ </p>
56
+
57
+ <ul class="also-scrape">
58
+ <li>
59
+ <a href="example.com">Meek Mill</a>
60
+ <span>1</span>
61
+ <span>2</span>
62
+ </li>
63
+ <li><a href="test.com">Drake</a></li>
64
+ </ul>
65
+
66
+ <p class="data">Beef</p>
67
+ EOF
68
+
69
+ you_can.scrape #=> ['Arrays', 'Too']
70
+
71
+ you_can.also_scrape.first.name #=> 'Meek Mill'
72
+ you_can.also_scrape.first.link #=> 'example.com'
73
+ you_can.also_scrape.first.numbers #=> ['1', '2']
74
+
75
+ you_can.nested_scrape.data #=> 'Beef'
76
+ ```
77
+
78
+ #### Customizing extraction
79
+
80
+ Any block given to `#element` will allow you to customize the value extracted from the found node.
81
+
82
+ Using `as: :something` would call a method named `#extract_something`.
83
+
84
+ ```ruby
85
+ class Advanced
86
+ element :first_name, '.name' do |node|
87
+ node.text.split(' ').first
88
+ end
89
+
90
+ element :birthday, '.birthday', as: :date
91
+
92
+ private
93
+
94
+ def extract_date(node)
95
+ Date.parse(node.text)
96
+ end
97
+ end
98
+
99
+ advanced = Advanced.scrape(<<-EOF)
100
+ <h1 class="name">Millard Fillmore</h1>
101
+ <h2 class="birthday">7-1-1800</h2>
102
+ EOF
103
+
104
+ advanced.first_name #=> 'Millard'
105
+ advanced.birthday #=> #<Date: 1800-01-07>
106
+ ```
107
+
108
+ ## Contributing
109
+
110
+ Bug reports and pull requests are welcome on GitHub at https://github.com/rzane/scraping.
111
+
112
+ ## License
113
+
114
+ The gem is available as open source under the terms of the [MIT License](http://opensource.org/licenses/MIT).
@@ -0,0 +1,10 @@
1
+ require "bundler/gem_tasks"
2
+ require "rake/testtask"
3
+
4
+ Rake::TestTask.new(:test) do |t|  # defines the `test` task
5
+ t.libs << "test"  # make test/ requirable from the test files
6
+ t.libs << "lib"  # make the gem's own lib/ requirable
7
+ t.test_files = FileList['test/**/*_test.rb']  # every *_test.rb file anywhere under test/
8
+ end
9
+
10
+ task :default => :test  # a bare `rake` runs the test task
@@ -0,0 +1,55 @@
1
+ require 'nokogiri'
2
+ require 'scraping/version'
3
+ require 'scraping/dsl'
4
+ require 'scraping/rules/element'
5
+ require 'scraping/rules/elements_of'
6
+ require 'scraping/rules/elements'
7
+
8
+ module Scraping  # Mixin: gives the including class the rule DSL, a page reader and #scrape
9
+ def self.included(base)  # hook run when a class includes Scraping
10
+ base.extend ClassMethods  # element/elements/elements_of/scrape become class-level methods
11
+
12
+ base.class_eval do
13
+ attr_reader :page  # the object handed to #initialize
14
+ end
15
+ end
16
+
17
+ def initialize(page)  # page: the document the rules run against (a Nokogiri::HTML document when built via .scrape)
18
+ @page = page
19
+ end
20
+
21
+ def scrape  # evaluates every class-level rule against page and stores each result through its writer
22
+ self.class.rules.each do |name, rule|
23
+ public_send("#{name}=", rule.call(self, page))
24
+ end
25
+ end
26
+
27
+ module ClassMethods  # class-level API added to every class that includes Scraping
28
+ include DSL  # supplies rules and the base element/elements_of/elements reached through super below
29
+
30
+ # Make the rules inheritable, but prevent mutation
31
+ # of the original hash
32
+ def inherited(subclass)  # NOTE(review): never calls super, so other inherited hooks in the chain are skipped
33
+ subclass.instance_variable_set(:@rules, rules.clone)  # copies the hash itself; the rule objects stay shared
34
+ end
35
+
36
+ def element(name, *)  # defines a reader/writer for name, then registers the rule via DSL#element
37
+ attr_accessor name
38
+ super  # bare super forwards the original arguments and block
39
+ end
40
+
41
+ def elements_of(name)  # same pattern for grouped rules; the block reaches DSL#elements_of through bare super
42
+ attr_accessor name
43
+ super
44
+ end
45
+
46
+ def elements(name, *)  # same pattern for multi-node rules
47
+ attr_accessor name
48
+ super
49
+ end
50
+
51
+ def scrape(html)  # parses html with Nokogiri::HTML, builds an instance, runs #scrape on it and returns the instance
52
+ new(Nokogiri::HTML(html)).tap(&:scrape)
53
+ end
54
+ end
55
+ end
@@ -0,0 +1,19 @@
1
+ module Scraping
2
+ module DSL  # rule-registration methods shared by Scraping::ClassMethods and Rules::ElementsOf
3
+ def rules  # lazily created Hash of name => rule object
4
+ @rules ||= {}
5
+ end
6
+
7
+ def element(name, selector, options = {}, &block)  # registers a single-node rule; options and block go to Rules::Element
8
+ rules[name] = Rules::Element.new(name, selector, options, &block)
9
+ end
10
+
11
+ def elements_of(name, &block)  # registers a grouped rule; the block declares the nested rules
12
+ rules[name] = Rules::ElementsOf.new(name).evaluate(&block)
13
+ end
14
+
15
+ def elements(name, selector, options = {}, &block)  # registers a multi-node rule; an optional block describes each matched item
16
+ rules[name] = Rules::Elements.new(name, selector, options).evaluate(&block)
17
+ end
18
+ end
19
+ end
@@ -0,0 +1,26 @@
1
+ module Scraping
2
+ module Rules
3
+ class Element  # rule that extracts one value from the node returned by node.at(selector)
4
+ attr_reader :name, :selector, :options, :extract
5
+
6
+ def initialize(name, selector, options = {}, &extract)  # the optional block customizes how the value is extracted
7
+ @name = name
8
+ @selector = selector
9
+ @options = options
10
+ @extract = extract if block_given?  # left unset (reader returns nil) when no block is given
11
+ end
12
+
13
+ def call(scraper, node)  # returns the extracted value, or nil when node.at(selector) finds nothing
14
+ item = node.at(selector)
15
+
16
+ if item && options[:as]  # as: takes precedence over a block when both are supplied
17
+ scraper.send("extract_#{options[:as]}", item)  # send rather than public_send, so private extract_* methods are reachable
18
+ elsif item && extract
19
+ scraper.instance_exec(item, &extract)  # the block runs with the scraper as self
20
+ elsif item
21
+ item.text  # default extraction: the node's text
22
+ end
23
+ end
24
+ end
25
+ end
26
+ end
@@ -0,0 +1,29 @@
1
+ module Scraping
2
+ module Rules
3
+ class Elements  # rule that passes every node from node.search(selector) through an inner rule
4
+ attr_reader :name, :selector, :rule, :options
5
+
6
+ def initialize(name, selector, options = {})
7
+ @name = name
8
+ @selector = selector
9
+ @options = options
10
+ end
11
+
12
+ def evaluate(&block)  # picks the per-item rule and returns self so the call can be chained
13
+ if block_given?
14
+ @rule = ElementsOf.new(name).evaluate(&block)  # each item becomes a nested object; options are not forwarded on this path
15
+ else
16
+ @rule = Element.new(name, '.', options)  # each item is extracted with the selector '.'
17
+ end
18
+
19
+ self
20
+ end
21
+
22
+ def call(scraper, node)  # returns the result of mapping each found node through rule
23
+ node.search(selector).map do |item|
24
+ rule.call(scraper, item)
25
+ end
26
+ end
27
+ end
28
+ end
29
+ end
@@ -0,0 +1,26 @@
1
+ require 'ostruct'
2
+
3
+ module Scraping
4
+ module Rules
5
+ class ElementsOf  # rule that groups nested rules into a single OpenStruct
6
+ include DSL  # nested element/elements/elements_of calls register in this instance's own rules
7
+ attr_reader :name
8
+
9
+ def initialize(name)
10
+ @name = name
11
+ end
12
+
13
+ def evaluate(&block)  # runs the block with this rule as self, then returns self
14
+ instance_eval(&block)
15
+ self
16
+ end
17
+
18
+ def call(scraper, node)  # builds an OpenStruct with one attribute per nested rule, all evaluated against the same node
19
+ rules.inject(OpenStruct.new) do |obj, (name, rule)|  # block-local name shadows the attr_reader
20
+ obj[name] = rule.call(scraper, node)
21
+ obj  # hand the accumulator back to inject
22
+ end
23
+ end
24
+ end
25
+ end
26
+ end
@@ -0,0 +1,3 @@
1
+ module Scraping
2
+ VERSION = "0.1.0"  # gem version; scraping.gemspec reads it for spec.version
3
+ end
@@ -0,0 +1,27 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'scraping/version'
5
+
6
+ Gem::Specification.new do |spec|  # package definition for the scraping gem
7
+ spec.name = "scraping"
8
+ spec.version = Scraping::VERSION  # defined in scraping/version, required above
9
+ spec.authors = ["Ray Zane"]
10
+ spec.email = ["ray@promptworks.com"]
11
+
12
+ spec.summary = %q{A DSL for scraping HTML into objects}
13
+ spec.description = %q{A DSL for scraping HTML into objects}
14
+ spec.homepage = "https://github.com/rzane/scraping"
15
+ spec.license = "MIT"
16
+
17
+ spec.files = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }  # git-tracked files minus test/, spec/ and features/
18
+ spec.bindir = "exe"
19
+ spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }  # basenames of packaged files under exe/
20
+ spec.require_paths = ["lib"]
21
+
22
+ spec.add_dependency 'nokogiri'  # only runtime dependency; no version constraint given
23
+
24
+ spec.add_development_dependency "bundler", "~> 1.12"
25
+ spec.add_development_dependency "rake", "~> 10.0"
26
+ spec.add_development_dependency "minitest", "~> 5.0"
27
+ end
metadata ADDED
@@ -0,0 +1,113 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: scraping
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Ray Zane
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2016-05-25 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: nokogiri
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: bundler
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '1.12'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '1.12'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rake
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '10.0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '10.0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: minitest
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '5.0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '5.0'
69
+ description: A DSL for scraping HTML into objects
70
+ email:
71
+ - ray@promptworks.com
72
+ executables: []
73
+ extensions: []
74
+ extra_rdoc_files: []
75
+ files:
76
+ - ".gitignore"
77
+ - ".travis.yml"
78
+ - Gemfile
79
+ - LICENSE.txt
80
+ - README.md
81
+ - Rakefile
82
+ - lib/scraping.rb
83
+ - lib/scraping/dsl.rb
84
+ - lib/scraping/rules/element.rb
85
+ - lib/scraping/rules/elements.rb
86
+ - lib/scraping/rules/elements_of.rb
87
+ - lib/scraping/version.rb
88
+ - scraping.gemspec
89
+ homepage: https://github.com/rzane/scraping
90
+ licenses:
91
+ - MIT
92
+ metadata: {}
93
+ post_install_message:
94
+ rdoc_options: []
95
+ require_paths:
96
+ - lib
97
+ required_ruby_version: !ruby/object:Gem::Requirement
98
+ requirements:
99
+ - - ">="
100
+ - !ruby/object:Gem::Version
101
+ version: '0'
102
+ required_rubygems_version: !ruby/object:Gem::Requirement
103
+ requirements:
104
+ - - ">="
105
+ - !ruby/object:Gem::Version
106
+ version: '0'
107
+ requirements: []
108
+ rubyforge_project:
109
+ rubygems_version: 2.4.8
110
+ signing_key:
111
+ specification_version: 4
112
+ summary: A DSL for scraping HTML into objects
113
+ test_files: []