scraping 0.1.0 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: b9aff8bf11dbde49763faa84c830518cb308daab
4
- data.tar.gz: ba701a4490c085c2278ff93efbdeb244a9b74b12
3
+ metadata.gz: d078337c4224a96b2587427a06556877d69eb656
4
+ data.tar.gz: dc8bdf3cc7a53568c99d869dfdda769c17c79d7b
5
5
  SHA512:
6
- metadata.gz: b6bc072d8df959e32b8cd5230db684ebffdb08d23ce3a2633a683042eed6f92a5f0c8e61b4c35ecd570975461d417a93f6941b1688d328eafb76cf958f247627
7
- data.tar.gz: 0149081ebc7cd6073e2a178f10c1904e46a3e3cab8e2fa9f35fc82387b1a7938d4f48428a9eb58d1adc835680c5dc0248b76bd562fb7b1e8288cc24efca46154
6
+ metadata.gz: ca76e4ac712fc15fea39a3f7d76298bf20e651410ec8a515a63345d4563740e2f59caede6f6b0a314cbb7335c7a72888c336ef0ef18bd32ee10bfe7fc21da921
7
+ data.tar.gz: c42df98a5a21a72b6cb7397dd34a099ec40ef36567c3f7a7b76dfcb3a4a864c8fe034276dd6ecf53d347e442292e939471c43ac0704d4080c302062f40dfdcb9
data/README.md CHANGED
@@ -35,15 +35,16 @@ You can also scrape arrays, objects, and arrays of objects. `elements` and `elem
35
35
  ```ruby
36
36
  class YouCan
37
37
  include Scraping
38
+
38
39
  elements :scrape, '.scrape'
39
40
 
40
- elements :also_scrape, '.also-scrape li' do
41
+ sections :also_scrape, '.also-scrape li' do
41
42
  element :name, 'a'
42
43
  element :link, 'a/@href'
43
44
  elements :numbers, 'span'
44
45
  end
45
46
 
46
- elements_of :nested_scrape do
47
+ section :nested_scrape do
47
48
  element :data, '.data'
48
49
  end
49
50
  end
@@ -89,6 +90,10 @@ class Advanced
89
90
 
90
91
  element :birthday, '.birthday', as: :date
91
92
 
93
+ elements :numbers, 'span' do |node|
94
+ node.text.to_i * 10
95
+ end
96
+
92
97
  private
93
98
 
94
99
  def extract_date(node)
@@ -99,15 +104,39 @@ end
99
104
  advanced = Advanced.new(<<-EOF)
100
105
  <h1 class="name">Millard Fillmore</h1>
101
106
  <h2 class="birthday">7-1-1800</h2>
107
+ <span>1</span>
108
+ <span>2</span>
102
109
  EOF
103
110
 
104
111
  advanced.first_name #=> 'Millard'
105
112
  advanced.birthday #=> #<Date: 1800-01-07>
113
+ advanced.numbers #=> [10, 20]
114
+ ```
115
+
116
+ ## HTTP
117
+
118
+ Scraping is totally agnostic of HTTP, but if you need a suggestion, check out [HTTParty](https://github.com/jnunemaker/httparty).
119
+
120
+ ```ruby
121
+ class HackerNews
122
+ include HTTParty
123
+ include Scraping
124
+
125
+ base_uri 'https://news.ycombinator.com'
126
+ elements :stories, '.athing .title > a'
127
+
128
+ def self.scrape
129
+ super get('/').body
130
+ end
131
+ end
132
+
133
+ news = HackerNews.scrape
134
+ puts news.stories.inspect
106
135
  ```
107
136
 
108
137
  ## Contributing
109
138
 
110
- Bug reports and pull requests are welcome on GitHub at https://github.com/[USERNAME]/scraping.
139
+ Bug reports and pull requests are welcome on GitHub at https://github.com/promptworks/scraping.
111
140
 
112
141
  ## License
113
142
 
@@ -2,8 +2,9 @@ require 'nokogiri'
2
2
  require 'scraping/version'
3
3
  require 'scraping/dsl'
4
4
  require 'scraping/rules/element'
5
- require 'scraping/rules/elements_of'
6
5
  require 'scraping/rules/elements'
6
+ require 'scraping/rules/section'
7
+ require 'scraping/rules/sections'
7
8
 
8
9
  module Scraping
9
10
  def self.included(base)
@@ -38,12 +39,17 @@ module Scraping
38
39
  super
39
40
  end
40
41
 
41
- def elements_of(name)
42
+ def elements(name, *)
42
43
  attr_accessor name
43
44
  super
44
45
  end
45
46
 
46
- def elements(name, *)
47
+ def section(name, *)
48
+ attr_accessor name
49
+ super
50
+ end
51
+
52
+ def sections(name, *)
47
53
  attr_accessor name
48
54
  super
49
55
  end
@@ -8,12 +8,16 @@ module Scraping
8
8
  rules[name] = Rules::Element.new(name, selector, options, &block)
9
9
  end
10
10
 
11
- def elements_of(name, &block)
12
- rules[name] = Rules::ElementsOf.new(name).evaluate(&block)
11
+ def elements(name, selector, options = {}, &block)
12
+ rules[name] = Rules::Elements.new(name, selector, options, &block)
13
13
  end
14
14
 
15
- def elements(name, selector, options = {}, &block)
16
- rules[name] = Rules::Elements.new(name, selector, options).evaluate(&block)
15
+ def section(name, selector = '.', &block)
16
+ rules[name] = Rules::Section.new(name, selector).evaluate(&block)
17
+ end
18
+
19
+ def sections(name, selector, &block)
20
+ rules[name] = Rules::Sections.new(name, selector).evaluate(&block)
17
21
  end
18
22
  end
19
23
  end
@@ -1,27 +1,18 @@
1
+ require 'scraping/rules/element'
2
+
1
3
  module Scraping
2
4
  module Rules
3
- class Elements
4
- attr_reader :name, :selector, :rule, :options
5
-
6
- def initialize(name, selector, options = {})
7
- @name = name
8
- @selector = selector
9
- @options = options
10
- end
11
-
12
- def evaluate(&block)
13
- if block_given?
14
- @rule = ElementsOf.new(name).evaluate(&block)
15
- else
16
- @rule = Element.new(name, '.', options)
17
- end
5
+ class Elements < Element
6
+ attr_reader :multiselector
18
7
 
19
- self
8
+ def initialize(name, selector, options = {}, &extract)
9
+ super(name, '.', options, &extract)
10
+ @multiselector = selector
20
11
  end
21
12
 
22
13
  def call(scraper, node)
23
- node.search(selector).map do |item|
24
- rule.call(scraper, item)
14
+ node.search(multiselector).map do |item|
15
+ super scraper, item
25
16
  end
26
17
  end
27
18
  end
@@ -2,12 +2,13 @@ require 'ostruct'
2
2
 
3
3
  module Scraping
4
4
  module Rules
5
- class ElementsOf
5
+ class Section
6
6
  include DSL
7
- attr_reader :name
7
+ attr_reader :name, :selector
8
8
 
9
- def initialize(name)
9
+ def initialize(name, selector = '.')
10
10
  @name = name
11
+ @selector = selector
11
12
  end
12
13
 
13
14
  def evaluate(&block)
@@ -17,7 +18,7 @@ module Scraping
17
18
 
18
19
  def call(scraper, node)
19
20
  rules.inject(OpenStruct.new) do |obj, (name, rule)|
20
- obj[name] = rule.call(scraper, node)
21
+ obj[name] = rule.call scraper, node.at(selector)
21
22
  obj
22
23
  end
23
24
  end
@@ -0,0 +1,20 @@
1
+ require 'scraping/rules/section'
2
+
3
+ module Scraping
4
+ module Rules
5
+ class Sections < Section
6
+ attr_reader :multiselector
7
+
8
+ def initialize(name, selector)
9
+ super name, '.'
10
+ @multiselector = selector
11
+ end
12
+
13
+ def call(scraper, node)
14
+ node.search(multiselector).map do |item|
15
+ super scraper, item
16
+ end
17
+ end
18
+ end
19
+ end
20
+ end
@@ -1,3 +1,3 @@
1
1
  module Scraping
2
- VERSION = "0.1.0"
2
+ VERSION = "0.2.0"
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: scraping
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ray Zane
@@ -83,7 +83,8 @@ files:
83
83
  - lib/scraping/dsl.rb
84
84
  - lib/scraping/rules/element.rb
85
85
  - lib/scraping/rules/elements.rb
86
- - lib/scraping/rules/elements_of.rb
86
+ - lib/scraping/rules/section.rb
87
+ - lib/scraping/rules/sections.rb
87
88
  - lib/scraping/version.rb
88
89
  - scraping.gemspec
89
90
  homepage: https://github.com/rzane/scraping