scraping 0.1.0 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +32 -3
- data/lib/scraping.rb +9 -3
- data/lib/scraping/dsl.rb +8 -4
- data/lib/scraping/rules/elements.rb +9 -18
- data/lib/scraping/rules/{elements_of.rb → section.rb} +5 -4
- data/lib/scraping/rules/sections.rb +20 -0
- data/lib/scraping/version.rb +1 -1
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d078337c4224a96b2587427a06556877d69eb656
|
4
|
+
data.tar.gz: dc8bdf3cc7a53568c99d869dfdda769c17c79d7b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ca76e4ac712fc15fea39a3f7d76298bf20e651410ec8a515a63345d4563740e2f59caede6f6b0a314cbb7335c7a72888c336ef0ef18bd32ee10bfe7fc21da921
|
7
|
+
data.tar.gz: c42df98a5a21a72b6cb7397dd34a099ec40ef36567c3f7a7b76dfcb3a4a864c8fe034276dd6ecf53d347e442292e939471c43ac0704d4080c302062f40dfdcb9
|
data/README.md
CHANGED
@@ -35,15 +35,16 @@ You can also scrape arrays, objects, and arrays of objects. `elements` and `elem
|
|
35
35
|
```ruby
|
36
36
|
class YouCan
|
37
37
|
include Scraping
|
38
|
+
|
38
39
|
elements :scrape, '.scrape'
|
39
40
|
|
40
|
-
|
41
|
+
sections :also_scrape, '.also-scrape li' do
|
41
42
|
element :name, 'a'
|
42
43
|
element :link, 'a/@href'
|
43
44
|
elements :numbers, 'span'
|
44
45
|
end
|
45
46
|
|
46
|
-
|
47
|
+
section :nested_scrape do
|
47
48
|
element :data, '.data'
|
48
49
|
end
|
49
50
|
end
|
@@ -89,6 +90,10 @@ class Advanced
|
|
89
90
|
|
90
91
|
element :birthday, '.birthday', as: :date
|
91
92
|
|
93
|
+
elements :numbers, 'span' do |node|
|
94
|
+
node.text.to_i * 10
|
95
|
+
end
|
96
|
+
|
92
97
|
private
|
93
98
|
|
94
99
|
def extract_date(node)
|
@@ -99,15 +104,39 @@ end
|
|
99
104
|
advanced = Advanced.new(<<-EOF)
|
100
105
|
<h1 class="name">Millard Fillmore</h1>
|
101
106
|
<h2 class="birthday">7-1-1800</h2>
|
107
|
+
<span>1</span>
|
108
|
+
<span>2</span>
|
102
109
|
EOF
|
103
110
|
|
104
111
|
advanced.first_name #=> 'Millard'
|
105
112
|
advanced.birthday #=> #<Date: 1800-01-07>
|
113
|
+
advanced.numbers #=> [10, 20]
|
114
|
+
```
|
115
|
+
|
116
|
+
## HTTP
|
117
|
+
|
118
|
+
Scraping is totally agnostic of HTTP, but if you need a suggestion, check out [HTTParty](https://github.com/jnunemaker/httparty).
|
119
|
+
|
120
|
+
```ruby
|
121
|
+
class HackerNews
|
122
|
+
include HTTParty
|
123
|
+
include Scraping
|
124
|
+
|
125
|
+
base_uri 'https://news.ycombinator.com'
|
126
|
+
elements :stories, '.athing .title > a'
|
127
|
+
|
128
|
+
def self.scrape
|
129
|
+
super get('/').body
|
130
|
+
end
|
131
|
+
end
|
132
|
+
|
133
|
+
news = HackerNews.scrape
|
134
|
+
puts news.stories.inspect
|
106
135
|
```
|
107
136
|
|
108
137
|
## Contributing
|
109
138
|
|
110
|
-
Bug reports and pull requests are welcome on GitHub at https://github.com/
|
139
|
+
Bug reports and pull requests are welcome on GitHub at https://github.com/promptworks/scraping.
|
111
140
|
|
112
141
|
## License
|
113
142
|
|
data/lib/scraping.rb
CHANGED
@@ -2,8 +2,9 @@ require 'nokogiri'
|
|
2
2
|
require 'scraping/version'
|
3
3
|
require 'scraping/dsl'
|
4
4
|
require 'scraping/rules/element'
|
5
|
-
require 'scraping/rules/elements_of'
|
6
5
|
require 'scraping/rules/elements'
|
6
|
+
require 'scraping/rules/section'
|
7
|
+
require 'scraping/rules/sections'
|
7
8
|
|
8
9
|
module Scraping
|
9
10
|
def self.included(base)
|
@@ -38,12 +39,17 @@ module Scraping
|
|
38
39
|
super
|
39
40
|
end
|
40
41
|
|
41
|
-
def
|
42
|
+
def elements(name, *)
|
42
43
|
attr_accessor name
|
43
44
|
super
|
44
45
|
end
|
45
46
|
|
46
|
-
def
|
47
|
+
def section(name, *)
|
48
|
+
attr_accessor name
|
49
|
+
super
|
50
|
+
end
|
51
|
+
|
52
|
+
def sections(name, *)
|
47
53
|
attr_accessor name
|
48
54
|
super
|
49
55
|
end
|
data/lib/scraping/dsl.rb
CHANGED
@@ -8,12 +8,16 @@ module Scraping
|
|
8
8
|
rules[name] = Rules::Element.new(name, selector, options, &block)
|
9
9
|
end
|
10
10
|
|
11
|
-
def
|
12
|
-
rules[name] = Rules::
|
11
|
+
def elements(name, selector, options = {}, &block)
|
12
|
+
rules[name] = Rules::Elements.new(name, selector, options, &block)
|
13
13
|
end
|
14
14
|
|
15
|
-
def
|
16
|
-
rules[name] = Rules::
|
15
|
+
def section(name, selector = '.', &block)
|
16
|
+
rules[name] = Rules::Section.new(name, selector).evaluate(&block)
|
17
|
+
end
|
18
|
+
|
19
|
+
def sections(name, selector, &block)
|
20
|
+
rules[name] = Rules::Sections.new(name, selector).evaluate(&block)
|
17
21
|
end
|
18
22
|
end
|
19
23
|
end
|
data/lib/scraping/rules/elements.rb
CHANGED
@@ -1,27 +1,18 @@
|
|
1
|
+
require 'scraping/rules/element'
|
2
|
+
|
1
3
|
module Scraping
|
2
4
|
module Rules
|
3
|
-
class Elements
|
4
|
-
attr_reader :
|
5
|
-
|
6
|
-
def initialize(name, selector, options = {})
|
7
|
-
@name = name
|
8
|
-
@selector = selector
|
9
|
-
@options = options
|
10
|
-
end
|
11
|
-
|
12
|
-
def evaluate(&block)
|
13
|
-
if block_given?
|
14
|
-
@rule = ElementsOf.new(name).evaluate(&block)
|
15
|
-
else
|
16
|
-
@rule = Element.new(name, '.', options)
|
17
|
-
end
|
5
|
+
class Elements < Element
|
6
|
+
attr_reader :multiselector
|
18
7
|
|
19
|
-
|
8
|
+
def initialize(name, selector, options = {}, &extract)
|
9
|
+
super(name, '.', options, &extract)
|
10
|
+
@multiselector = selector
|
20
11
|
end
|
21
12
|
|
22
13
|
def call(scraper, node)
|
23
|
-
node.search(
|
24
|
-
|
14
|
+
node.search(multiselector).map do |item|
|
15
|
+
super scraper, item
|
25
16
|
end
|
26
17
|
end
|
27
18
|
end
|
data/lib/scraping/rules/{elements_of.rb → section.rb}
CHANGED
@@ -2,12 +2,13 @@ require 'ostruct'
|
|
2
2
|
|
3
3
|
module Scraping
|
4
4
|
module Rules
|
5
|
-
class ElementsOf
|
5
|
+
class Section
|
6
6
|
include DSL
|
7
|
-
attr_reader :name
|
7
|
+
attr_reader :name, :selector
|
8
8
|
|
9
|
-
def initialize(name)
|
9
|
+
def initialize(name, selector = '.')
|
10
10
|
@name = name
|
11
|
+
@selector = selector
|
11
12
|
end
|
12
13
|
|
13
14
|
def evaluate(&block)
|
@@ -17,7 +18,7 @@ module Scraping
|
|
17
18
|
|
18
19
|
def call(scraper, node)
|
19
20
|
rules.inject(OpenStruct.new) do |obj, (name, rule)|
|
20
|
-
obj[name] = rule.call
|
21
|
+
obj[name] = rule.call scraper, node.at(selector)
|
21
22
|
obj
|
22
23
|
end
|
23
24
|
end
|
data/lib/scraping/rules/sections.rb
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
require 'scraping/rules/section'
|
2
|
+
|
3
|
+
module Scraping
|
4
|
+
module Rules
|
5
|
+
class Sections < Section
|
6
|
+
attr_reader :multiselector
|
7
|
+
|
8
|
+
def initialize(name, selector)
|
9
|
+
super name, '.'
|
10
|
+
@multiselector = selector
|
11
|
+
end
|
12
|
+
|
13
|
+
def call(scraper, node)
|
14
|
+
node.search(multiselector).map do |item|
|
15
|
+
super scraper, item
|
16
|
+
end
|
17
|
+
end
|
18
|
+
end
|
19
|
+
end
|
20
|
+
end
|
data/lib/scraping/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: scraping
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.0
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ray Zane
|
@@ -83,7 +83,8 @@ files:
|
|
83
83
|
- lib/scraping/dsl.rb
|
84
84
|
- lib/scraping/rules/element.rb
|
85
85
|
- lib/scraping/rules/elements.rb
|
86
|
-
- lib/scraping/rules/elements_of.rb
|
86
|
+
- lib/scraping/rules/section.rb
|
87
|
+
- lib/scraping/rules/sections.rb
|
87
88
|
- lib/scraping/version.rb
|
88
89
|
- scraping.gemspec
|
89
90
|
homepage: https://github.com/rzane/scraping
|