scraping 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +32 -3
- data/lib/scraping.rb +9 -3
- data/lib/scraping/dsl.rb +8 -4
- data/lib/scraping/rules/elements.rb +9 -18
- data/lib/scraping/rules/{elements_of.rb → section.rb} +5 -4
- data/lib/scraping/rules/sections.rb +20 -0
- data/lib/scraping/version.rb +1 -1
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d078337c4224a96b2587427a06556877d69eb656
|
4
|
+
data.tar.gz: dc8bdf3cc7a53568c99d869dfdda769c17c79d7b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ca76e4ac712fc15fea39a3f7d76298bf20e651410ec8a515a63345d4563740e2f59caede6f6b0a314cbb7335c7a72888c336ef0ef18bd32ee10bfe7fc21da921
|
7
|
+
data.tar.gz: c42df98a5a21a72b6cb7397dd34a099ec40ef36567c3f7a7b76dfcb3a4a864c8fe034276dd6ecf53d347e442292e939471c43ac0704d4080c302062f40dfdcb9
|
data/README.md
CHANGED
@@ -35,15 +35,16 @@ You can also scrape arrays, objects, and arrays of objects. `elements` and `elem
|
|
35
35
|
```ruby
|
36
36
|
class YouCan
|
37
37
|
include Scraping
|
38
|
+
|
38
39
|
elements :scrape, '.scrape'
|
39
40
|
|
40
|
-
|
41
|
+
sections :also_scrape, '.also-scrape li' do
|
41
42
|
element :name, 'a'
|
42
43
|
element :link, 'a/@href'
|
43
44
|
elements :numbers, 'span'
|
44
45
|
end
|
45
46
|
|
46
|
-
|
47
|
+
section :nested_scrape do
|
47
48
|
element :data, '.data'
|
48
49
|
end
|
49
50
|
end
|
@@ -89,6 +90,10 @@ class Advanced
|
|
89
90
|
|
90
91
|
element :birthday, '.birthday', as: :date
|
91
92
|
|
93
|
+
elements :numbers, 'span' do |node|
|
94
|
+
node.text.to_i * 10
|
95
|
+
end
|
96
|
+
|
92
97
|
private
|
93
98
|
|
94
99
|
def extract_date(node)
|
@@ -99,15 +104,39 @@ end
|
|
99
104
|
advanced = Advanced.new(<<-EOF)
|
100
105
|
<h1 class="name">Millard Fillmore</h1>
|
101
106
|
<h2 class="birthday">7-1-1800</h2>
|
107
|
+
<span>1</span>
|
108
|
+
<span>2</span>
|
102
109
|
EOF
|
103
110
|
|
104
111
|
advanced.first_name #=> 'Millard'
|
105
112
|
advanced.birthday #=> #<Date: 1800-01-07>
|
113
|
+
advanced.numbers #=> [10, 20]
|
114
|
+
```
|
115
|
+
|
116
|
+
## HTTP
|
117
|
+
|
118
|
+
Scraping is totally agnostic of HTTP, but if you need a suggestion, check out [HTTParty](https://github.com/jnunemaker/httparty).
|
119
|
+
|
120
|
+
```ruby
|
121
|
+
class HackerNews
|
122
|
+
include HTTParty
|
123
|
+
include Scraping
|
124
|
+
|
125
|
+
base_uri 'https://news.ycombinator.com'
|
126
|
+
elements :stories, '.athing .title > a'
|
127
|
+
|
128
|
+
def self.scrape
|
129
|
+
super get('/').body
|
130
|
+
end
|
131
|
+
end
|
132
|
+
|
133
|
+
news = HackerNews.scrape
|
134
|
+
puts news.stories.inspect
|
106
135
|
```
|
107
136
|
|
108
137
|
## Contributing
|
109
138
|
|
110
|
-
Bug reports and pull requests are welcome on GitHub at https://github.com/
|
139
|
+
Bug reports and pull requests are welcome on GitHub at https://github.com/promptworks/scraping.
|
111
140
|
|
112
141
|
## License
|
113
142
|
|
data/lib/scraping.rb
CHANGED
@@ -2,8 +2,9 @@ require 'nokogiri'
|
|
2
2
|
require 'scraping/version'
|
3
3
|
require 'scraping/dsl'
|
4
4
|
require 'scraping/rules/element'
|
5
|
-
require 'scraping/rules/elements_of'
|
6
5
|
require 'scraping/rules/elements'
|
6
|
+
require 'scraping/rules/section'
|
7
|
+
require 'scraping/rules/sections'
|
7
8
|
|
8
9
|
module Scraping
|
9
10
|
def self.included(base)
|
@@ -38,12 +39,17 @@ module Scraping
|
|
38
39
|
super
|
39
40
|
end
|
40
41
|
|
41
|
-
def
|
42
|
+
def elements(name, *)
|
42
43
|
attr_accessor name
|
43
44
|
super
|
44
45
|
end
|
45
46
|
|
46
|
-
def
|
47
|
+
def section(name, *)
|
48
|
+
attr_accessor name
|
49
|
+
super
|
50
|
+
end
|
51
|
+
|
52
|
+
def sections(name, *)
|
47
53
|
attr_accessor name
|
48
54
|
super
|
49
55
|
end
|
data/lib/scraping/dsl.rb
CHANGED
@@ -8,12 +8,16 @@ module Scraping
|
|
8
8
|
rules[name] = Rules::Element.new(name, selector, options, &block)
|
9
9
|
end
|
10
10
|
|
11
|
-
def
|
12
|
-
rules[name] = Rules::
|
11
|
+
def elements(name, selector, options = {}, &block)
|
12
|
+
rules[name] = Rules::Elements.new(name, selector, options, &block)
|
13
13
|
end
|
14
14
|
|
15
|
-
def
|
16
|
-
rules[name] = Rules::
|
15
|
+
def section(name, selector = '.', &block)
|
16
|
+
rules[name] = Rules::Section.new(name, selector).evaluate(&block)
|
17
|
+
end
|
18
|
+
|
19
|
+
def sections(name, selector, &block)
|
20
|
+
rules[name] = Rules::Sections.new(name, selector).evaluate(&block)
|
17
21
|
end
|
18
22
|
end
|
19
23
|
end
|
@@ -1,27 +1,18 @@
|
|
1
|
+
require 'scraping/rules/element'
|
2
|
+
|
1
3
|
module Scraping
|
2
4
|
module Rules
|
3
|
-
class Elements
|
4
|
-
attr_reader :
|
5
|
-
|
6
|
-
def initialize(name, selector, options = {})
|
7
|
-
@name = name
|
8
|
-
@selector = selector
|
9
|
-
@options = options
|
10
|
-
end
|
11
|
-
|
12
|
-
def evaluate(&block)
|
13
|
-
if block_given?
|
14
|
-
@rule = ElementsOf.new(name).evaluate(&block)
|
15
|
-
else
|
16
|
-
@rule = Element.new(name, '.', options)
|
17
|
-
end
|
5
|
+
class Elements < Element
|
6
|
+
attr_reader :multiselector
|
18
7
|
|
19
|
-
|
8
|
+
def initialize(name, selector, options = {}, &extract)
|
9
|
+
super(name, '.', options, &extract)
|
10
|
+
@multiselector = selector
|
20
11
|
end
|
21
12
|
|
22
13
|
def call(scraper, node)
|
23
|
-
node.search(
|
24
|
-
|
14
|
+
node.search(multiselector).map do |item|
|
15
|
+
super scraper, item
|
25
16
|
end
|
26
17
|
end
|
27
18
|
end
|
@@ -2,12 +2,13 @@ require 'ostruct'
|
|
2
2
|
|
3
3
|
module Scraping
|
4
4
|
module Rules
|
5
|
-
class
|
5
|
+
class Section
|
6
6
|
include DSL
|
7
|
-
attr_reader :name
|
7
|
+
attr_reader :name, :selector
|
8
8
|
|
9
|
-
def initialize(name)
|
9
|
+
def initialize(name, selector = '.')
|
10
10
|
@name = name
|
11
|
+
@selector = selector
|
11
12
|
end
|
12
13
|
|
13
14
|
def evaluate(&block)
|
@@ -17,7 +18,7 @@ module Scraping
|
|
17
18
|
|
18
19
|
def call(scraper, node)
|
19
20
|
rules.inject(OpenStruct.new) do |obj, (name, rule)|
|
20
|
-
obj[name] = rule.call
|
21
|
+
obj[name] = rule.call scraper, node.at(selector)
|
21
22
|
obj
|
22
23
|
end
|
23
24
|
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
require 'scraping/rules/section'
|
2
|
+
|
3
|
+
module Scraping
|
4
|
+
module Rules
|
5
|
+
class Sections < Section
|
6
|
+
attr_reader :multiselector
|
7
|
+
|
8
|
+
def initialize(name, selector)
|
9
|
+
super name, '.'
|
10
|
+
@multiselector = selector
|
11
|
+
end
|
12
|
+
|
13
|
+
def call(scraper, node)
|
14
|
+
node.search(multiselector).map do |item|
|
15
|
+
super scraper, item
|
16
|
+
end
|
17
|
+
end
|
18
|
+
end
|
19
|
+
end
|
20
|
+
end
|
data/lib/scraping/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: scraping
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ray Zane
|
@@ -83,7 +83,8 @@ files:
|
|
83
83
|
- lib/scraping/dsl.rb
|
84
84
|
- lib/scraping/rules/element.rb
|
85
85
|
- lib/scraping/rules/elements.rb
|
86
|
-
- lib/scraping/rules/
|
86
|
+
- lib/scraping/rules/section.rb
|
87
|
+
- lib/scraping/rules/sections.rb
|
87
88
|
- lib/scraping/version.rb
|
88
89
|
- scraping.gemspec
|
89
90
|
homepage: https://github.com/rzane/scraping
|