maiha-dm-ys 0.2.2 → 0.3
Sign up to get free protection for your applications and to get access to all the features.
- data/README +25 -0
- data/Rakefile +1 -1
- data/lib/dm-ys/base.rb +9 -0
- data/lib/dm-ys/proxy.rb +3 -12
- data/lib/dm-ys/scraper.rb +189 -66
- data/spec/anonymous_spec.rb +26 -0
- data/spec/composite_scraper_spec.rb +69 -0
- data/spec/data/blank.html +0 -0
- data/spec/data/gem_maintainers.html +226 -0
- data/spec/data/plugins1.html +258 -0
- data/spec/data/plugins2.html +224 -0
- data/spec/guess_spec.rb +16 -15
- data/spec/indexed_property_spec.rb +6 -11
- data/spec/models/gem_maintainer.rb +4 -0
- data/spec/models/plugin.rb +14 -0
- data/spec/pagination_spec.rb +7 -0
- data/spec/scraper_spec.rb +33 -0
- data/spec/scraper_utils_spec.rb +35 -0
- data/spec/spec_helper.rb +28 -2
- metadata +12 -4
data/README
CHANGED
@@ -24,4 +24,29 @@ Example
|
|
24
24
|
=> ["Gem", "Maintainer", "Name"]
|
25
25
|
|
26
26
|
|
27
|
+
Pagination
|
28
|
+
==========
|
29
|
+
|
30
|
+
The special syntax "uri*" enables pagination mode, which retrieves pages recursively.
|
31
|
+
|
32
|
+
class Plugin
|
33
|
+
include DataMapper::YunkerStar
|
34
|
+
uri "http://merbi.st/plugins/index?page=1"
|
35
|
+
end
|
36
|
+
|
37
|
+
irb(main):001:0> Plugin.count
|
38
|
+
=> 20
|
39
|
+
|
40
|
+
This parses only the specified uri.
|
41
|
+
Append "*" to the uri if you want pagination mode.
|
42
|
+
|
43
|
+
class Plugin
|
44
|
+
include DataMapper::YunkerStar
|
45
|
+
uri "http://merbi.st/plugins/index?page=1*"
|
46
|
+
end
|
47
|
+
|
48
|
+
irb(main):001:0> Plugin.count
|
49
|
+
=> 36
|
50
|
+
|
51
|
+
|
27
52
|
Copyright (c) 2008 maiha@wota.jp, released under the MIT license
|
data/Rakefile
CHANGED
@@ -33,7 +33,7 @@ AUTHOR = "maiha"
|
|
33
33
|
EMAIL = "maiha@wota.jp"
|
34
34
|
HOMEPAGE = "http://github.com/maiha/dm-ys"
|
35
35
|
SUMMARY = "a DataMapper extension that uses html table as its schema and data powerfully like YunkerStar"
|
36
|
-
GEM_VERSION = "0.2.2"
|
36
|
+
GEM_VERSION = "0.3"
|
37
37
|
|
38
38
|
spec = Gem::Specification.new do |s|
|
39
39
|
# s.rubyforge_project = 'merb'
|
data/lib/dm-ys/base.rb
CHANGED
data/lib/dm-ys/proxy.rb
CHANGED
@@ -24,16 +24,7 @@ module DataMapper
|
|
24
24
|
|
25
25
|
module ClassMethods
|
26
26
|
def proxy
|
27
|
-
@proxy ||=
|
28
|
-
end
|
29
|
-
|
30
|
-
def lazy_load
|
31
|
-
loader = Scraper.new(self)
|
32
|
-
loader.names.each do |name|
|
33
|
-
type = String # TODO
|
34
|
-
property name.intern, type
|
35
|
-
end
|
36
|
-
return loader
|
27
|
+
@proxy ||= Scraper.load(self)
|
37
28
|
end
|
38
29
|
|
39
30
|
def names
|
@@ -54,9 +45,9 @@ module DataMapper
|
|
54
45
|
|
55
46
|
def all
|
56
47
|
count = 0
|
57
|
-
@all ||=
|
48
|
+
@all ||= entries.map{|array|
|
58
49
|
count += 1
|
59
|
-
new(Hash[*
|
50
|
+
new(Hash[*names.zip(array).flatten].merge(:id=>count))
|
60
51
|
}
|
61
52
|
end
|
62
53
|
|
data/lib/dm-ys/scraper.rb
CHANGED
@@ -8,96 +8,219 @@ module DataMapper
|
|
8
8
|
require 'open-uri'
|
9
9
|
require 'hpricot'
|
10
10
|
|
11
|
-
|
12
|
-
include CachedAccessor
|
13
|
-
|
11
|
+
module Scraper
|
14
12
|
class TableNotFound < RuntimeError; end
|
15
13
|
|
16
|
-
|
14
|
+
def self.paginate?(model)
|
15
|
+
model.uri.to_s[-1] == ?*
|
16
|
+
end
|
17
17
|
|
18
|
-
def
|
19
|
-
|
20
|
-
|
21
|
-
@model = model
|
22
|
-
@html = NKF.nkf('-w', open(model.uri).read)
|
23
|
-
@invalid_name_count = 0
|
18
|
+
def self.lookup(model)
|
19
|
+
scraper = paginate?(model) ? Composite : Page
|
20
|
+
scraper.new(model)
|
24
21
|
end
|
25
22
|
|
26
|
-
def
|
27
|
-
|
28
|
-
|
23
|
+
def self.load(model)
|
24
|
+
loader = lookup(model)
|
25
|
+
loader.register_properties!
|
26
|
+
return loader
|
29
27
|
end
|
30
28
|
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
29
|
+
######################################################################
|
30
|
+
### Utils
|
31
|
+
|
32
|
+
module Utils
|
33
|
+
def constantize(label)
|
34
|
+
require 'cgi'
|
35
|
+
label = CGI.unescapeHTML(label.to_s)
|
36
|
+
label.gsub!(/&[a-z]+;/, '')
|
37
|
+
label.gsub!(/\r?\n/, '')
|
38
|
+
label.gsub!(/\s+/,'')
|
39
|
+
label.delete!('!"#$%&()=~|`{}^-[]/<>:;,.\\-')
|
40
|
+
label.delete!("'")
|
41
|
+
return label
|
42
|
+
end
|
43
|
+
|
44
|
+
module_function :constantize
|
39
45
|
end
|
40
46
|
|
41
|
-
|
47
|
+
######################################################################
|
48
|
+
### Base Scraper
|
42
49
|
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
[count, table]
|
50
|
+
class Base
|
51
|
+
include CachedAccessor
|
52
|
+
|
53
|
+
def initialize(model, *args)
|
54
|
+
raise ArgumentError, "missing model" unless model
|
55
|
+
raise ArgumentError, "missing uri" unless model.uri
|
56
|
+
@model = model
|
57
|
+
end
|
58
|
+
[:names, :labels, :entries].each do |method|
|
59
|
+
define_method(method) {raise NotImplementedError, method.to_s}
|
54
60
|
end
|
55
61
|
|
56
|
-
def
|
57
|
-
@model.
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
when Hpricot::Elements
|
65
|
-
return element.first
|
66
|
-
else
|
67
|
-
return nil
|
62
|
+
def uri
|
63
|
+
@uri || @model.uri.to_s.chomp('*')
|
64
|
+
end
|
65
|
+
|
66
|
+
def register_properties!
|
67
|
+
names.each do |name|
|
68
|
+
type = String # TODO
|
69
|
+
@model.property name.intern, type
|
68
70
|
end
|
69
71
|
end
|
72
|
+
end
|
70
73
|
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
74
|
+
######################################################################
|
75
|
+
### Page Scraper
|
76
|
+
|
77
|
+
class Page < Base
|
78
|
+
attr_reader :html
|
79
|
+
|
80
|
+
def initialize(model, uri = nil)
|
81
|
+
super
|
82
|
+
@uri = uri
|
83
|
+
@html = NKF.nkf('-w', open(self.uri).read)
|
84
|
+
@invalid_name_count = 0
|
85
|
+
end
|
86
|
+
|
87
|
+
def guess_table
|
88
|
+
max_table or
|
89
|
+
raise TableNotFound, "set 'table' or 'tbody' manually"
|
90
|
+
end
|
91
|
+
|
92
|
+
def pagination_links
|
93
|
+
base = URI.parse(uri.split('?').first)
|
94
|
+
urls = (doc / "a").map{|i| i[:href] =~ /^http/ ? i[:href] : (base+i[:href]).to_s}.uniq
|
95
|
+
urls.select{|url| /^#{Regexp.escape(base.to_s)}/ === url}
|
96
|
+
end
|
97
|
+
|
98
|
+
def inspect
|
99
|
+
attrs = [
|
100
|
+
[ :html, "#{html.size}bytes" ],
|
101
|
+
[ :names, names ],
|
102
|
+
[ :entries, entries.size ],
|
103
|
+
]
|
104
|
+
"#<#{self.class.name} #{attrs.map { |(k,v)| "@#{k}=#{v.inspect}" } * ' '}>"
|
105
|
+
end
|
106
|
+
|
107
|
+
cached_accessor do
|
108
|
+
doc {Hpricot(@html)}
|
109
|
+
table {specified(:table) or guess_table}
|
110
|
+
thead {specified(:thead) or table.search("> thead").first or table}
|
111
|
+
tbody {specified(:tbody) or table.search("> tbody").first or table}
|
112
|
+
names {labels.map{|i| label2name(i)}}
|
113
|
+
labels {thead.search("> tr").first.search("> td|th").map{|i|strip_tags(i.inner_html)}}
|
114
|
+
entries {tbody.search("> tr").map{|tr| tr.search("> td").map{|i|strip_tags(i.inner_html)}}.delete_if{|i|i.blank?}}
|
115
|
+
count {entries.size}
|
116
|
+
end
|
79
117
|
|
80
|
-
|
81
|
-
|
118
|
+
private
|
119
|
+
|
120
|
+
def max_table
|
121
|
+
table = nil
|
122
|
+
count = -1
|
123
|
+
doc.search("table").each do |t|
|
124
|
+
size = [t.search("> tr").size, t.search("> tbody > tr").size].max
|
125
|
+
if size > count
|
126
|
+
count = size
|
127
|
+
table = t
|
128
|
+
end
|
129
|
+
end
|
130
|
+
return table
|
131
|
+
end
|
132
|
+
|
133
|
+
def specified(name)
|
134
|
+
@model.respond_to?(name) or raise ArgumentError, "invalid selector name: #{name}"
|
135
|
+
css = @model.__send__(name) or return nil
|
136
|
+
|
137
|
+
element = doc.search(css)
|
138
|
+
case element
|
139
|
+
when Hpricot::Elem
|
140
|
+
return element
|
141
|
+
when Hpricot::Elements
|
142
|
+
return element.first
|
143
|
+
else
|
144
|
+
return nil
|
145
|
+
end
|
82
146
|
end
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
147
|
+
|
148
|
+
def label2name(label)
|
149
|
+
label = Utils.constantize(label)
|
150
|
+
|
151
|
+
if /^([A-Z])/ === label and Object.const_defined?(label)
|
152
|
+
label = "_#{label}"
|
153
|
+
end
|
154
|
+
if label.blank? or @model.respond_to?(label, true)
|
155
|
+
new_name_for(label)
|
156
|
+
elsif /^[0-9]/ === label
|
157
|
+
"_#{label}"
|
158
|
+
else
|
159
|
+
label
|
160
|
+
end
|
89
161
|
end
|
162
|
+
|
163
|
+
def new_name_for(label)
|
164
|
+
@invalid_name_count += 1
|
165
|
+
"col_#{@invalid_name_count}"
|
166
|
+
end
|
167
|
+
|
168
|
+
def strip_tags(html)
|
169
|
+
html.gsub(/<.*?>/, '').strip
|
170
|
+
end
|
171
|
+
end
|
172
|
+
|
173
|
+
######################################################################
|
174
|
+
### Composite Scraper
|
175
|
+
|
176
|
+
class Composite < Base
|
177
|
+
def pages
|
178
|
+
@pages ||= execute
|
90
179
|
end
|
91
180
|
|
92
|
-
def
|
93
|
-
|
94
|
-
"col_#{@invalid_name_count}"
|
181
|
+
def count
|
182
|
+
pages.map(&:count).inject(0){|i,v| i+v}
|
95
183
|
end
|
96
184
|
|
97
|
-
def
|
98
|
-
|
185
|
+
def names
|
186
|
+
pages.first.names
|
99
187
|
end
|
100
188
|
|
189
|
+
def labels
|
190
|
+
pages.first.labels
|
191
|
+
end
|
192
|
+
|
193
|
+
def entries
|
194
|
+
pages.inject([]){|a,p| a+p.entries}
|
195
|
+
end
|
196
|
+
|
197
|
+
private
|
198
|
+
def execute
|
199
|
+
visit(uri)
|
200
|
+
valid_pages
|
201
|
+
end
|
202
|
+
|
203
|
+
def valid_pages
|
204
|
+
loaded_pages.values.compact
|
205
|
+
end
|
206
|
+
|
207
|
+
def loaded_pages
|
208
|
+
@loaded_pages ||= {} # url => page object
|
209
|
+
end
|
210
|
+
|
211
|
+
def visit(uri)
|
212
|
+
return if loaded_pages[uri]
|
213
|
+
page = Page.new(@model, uri)
|
214
|
+
base = valid_pages.first
|
215
|
+
if !base or base.names == page.names
|
216
|
+
loaded_pages[uri] = page
|
217
|
+
else
|
218
|
+
loaded_pages[uri] = nil
|
219
|
+
end
|
220
|
+
page.pagination_links.each{|uri| visit(uri)}
|
221
|
+
end
|
222
|
+
end
|
223
|
+
|
101
224
|
end
|
102
225
|
end
|
103
226
|
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
require File.join( File.dirname(__FILE__), "spec_helper" )
|
2
|
+
|
3
|
+
describe DataMapper::YunkerStar do
|
4
|
+
it "should provide []" do
|
5
|
+
DataMapper::YunkerStar.should respond_to(:[])
|
6
|
+
end
|
7
|
+
|
8
|
+
describe "[]" do
|
9
|
+
before(:each) do
|
10
|
+
@uri = "http://merbi.st/plugins/index?page=1"
|
11
|
+
@ys = DataMapper::YunkerStar[@uri]
|
12
|
+
end
|
13
|
+
|
14
|
+
it "should return a new class" do
|
15
|
+
@ys.should be_kind_of(Class)
|
16
|
+
end
|
17
|
+
|
18
|
+
it "should include DataMapper::YunkerStar" do
|
19
|
+
@ys.ancestors.should be_include(DataMapper::YunkerStar)
|
20
|
+
end
|
21
|
+
|
22
|
+
it "should set uri" do
|
23
|
+
@ys.uri.should == @uri
|
24
|
+
end
|
25
|
+
end
|
26
|
+
end
|
@@ -0,0 +1,69 @@
|
|
1
|
+
require File.join( File.dirname(__FILE__), "spec_helper" )
|
2
|
+
|
3
|
+
describe DataMapper::YunkerStar::Scraper::Composite do
|
4
|
+
before(:each) do
|
5
|
+
@scraper = DataMapper::YunkerStar::Scraper::Composite.new(Plugin)
|
6
|
+
end
|
7
|
+
|
8
|
+
it "should provide #uri" do
|
9
|
+
@scraper.should respond_to(:uri)
|
10
|
+
end
|
11
|
+
|
12
|
+
describe "#uri" do
|
13
|
+
it "should strip last '*'" do
|
14
|
+
@scraper.uri.should == "http://merbi.st/plugins/index?page=1"
|
15
|
+
end
|
16
|
+
end
|
17
|
+
|
18
|
+
it "should provide #pages" do
|
19
|
+
@scraper.should respond_to(:pages)
|
20
|
+
end
|
21
|
+
|
22
|
+
describe "#pages" do
|
23
|
+
it "should return page objects" do
|
24
|
+
@scraper.pages.should be_kind_of(Array)
|
25
|
+
@scraper.pages.map(&:uri).sort.should ==
|
26
|
+
["http://merbi.st/plugins/index?page=1","http://merbi.st/plugins/index?page=2"]
|
27
|
+
end
|
28
|
+
end
|
29
|
+
|
30
|
+
it "should provide #count" do
|
31
|
+
@scraper.should respond_to(:count)
|
32
|
+
end
|
33
|
+
|
34
|
+
describe "#count" do
|
35
|
+
it "should return sum of page counts" do
|
36
|
+
@scraper.count.should == 36
|
37
|
+
end
|
38
|
+
end
|
39
|
+
|
40
|
+
it "should provide #names" do
|
41
|
+
@scraper.should respond_to(:names)
|
42
|
+
end
|
43
|
+
|
44
|
+
describe "#names" do
|
45
|
+
it "should return same value as Plugin" do
|
46
|
+
@scraper.names.should == DataMapper::YunkerStar::Scraper::Page.new(Plugin1).names
|
47
|
+
end
|
48
|
+
end
|
49
|
+
|
50
|
+
it "should provide #labels" do
|
51
|
+
@scraper.should respond_to(:labels)
|
52
|
+
end
|
53
|
+
|
54
|
+
describe "#labels" do
|
55
|
+
it "should return same value as Plugin" do
|
56
|
+
@scraper.labels.should == DataMapper::YunkerStar::Scraper::Page.new(Plugin1).labels
|
57
|
+
end
|
58
|
+
end
|
59
|
+
|
60
|
+
it "should provide #entries" do
|
61
|
+
@scraper.should respond_to(:entries)
|
62
|
+
end
|
63
|
+
|
64
|
+
describe "#entries" do
|
65
|
+
it "should return same value as Plugin" do
|
66
|
+
@scraper.entries.should == (Plugin1.entries + Plugin2.entries)
|
67
|
+
end
|
68
|
+
end
|
69
|
+
end
|