maiha-dm-ys 0.2.2 → 0.3

Sign up to get free protection for your applications and to get access to all the features.
data/README CHANGED
@@ -24,4 +24,29 @@ Example
24
24
  => ["Gem", "Maintainer", "Name"]
25
25
 
26
26
 
27
+ Pagination
28
+ ==========
29
+
30
+ The special syntax "uri*" (a trailing "*" on the uri) enables pagination mode, which recursively retrieves the linked pages.
31
+
32
+ class Plugin
33
+ include DataMapper::YunkerStar
34
+ uri "http://merbi.st/plugins/index?page=1"
35
+ end
36
+
37
+ irb(main):001:0> Plugin.count
38
+ => 20
39
+
40
+ This parses only the specified uri.
41
+ Append "*" to the uri if you want pagination mode.
42
+
43
+ class Plugin
44
+ include DataMapper::YunkerStar
45
+ uri "http://merbi.st/plugins/index?page=1*"
46
+ end
47
+
48
+ irb(main):001:0> Plugin.count
49
+ => 36
50
+
51
+
27
52
  Copyright (c) 2008 maiha@wota.jp, released under the MIT license
data/Rakefile CHANGED
@@ -33,7 +33,7 @@ AUTHOR = "maiha"
33
33
  EMAIL = "maiha@wota.jp"
34
34
  HOMEPAGE = "http://github.com/maiha/dm-ys"
35
35
  SUMMARY = "a DataMapper extension that uses html table as its schema and data powerfully like YunkerStar"
36
- GEM_VERSION = "0.2.2"
36
+ GEM_VERSION = "0.3"
37
37
 
38
38
  spec = Gem::Specification.new do |s|
39
39
  # s.rubyforge_project = 'merb'
@@ -26,5 +26,14 @@ module DataMapper
26
26
  def self.descendants
27
27
  @descendants ||= Set.new
28
28
  end
29
+
30
+ # @api public
31
+ def self.[](uri)
32
+ klass = Class.new do
33
+ include DataMapper::YunkerStar
34
+ end
35
+ klass.uri uri
36
+ return klass
37
+ end
29
38
  end
30
39
  end
@@ -24,16 +24,7 @@ module DataMapper
24
24
 
25
25
  module ClassMethods
26
26
  def proxy
27
- @proxy ||= lazy_load
28
- end
29
-
30
- def lazy_load
31
- loader = Scraper.new(self)
32
- loader.names.each do |name|
33
- type = String # TODO
34
- property name.intern, type
35
- end
36
- return loader
27
+ @proxy ||= Scraper.load(self)
37
28
  end
38
29
 
39
30
  def names
@@ -54,9 +45,9 @@ module DataMapper
54
45
 
55
46
  def all
56
47
  count = 0
57
- @all ||= proxy.entries.map{|array|
48
+ @all ||= entries.map{|array|
58
49
  count += 1
59
- new(Hash[*proxy.names.zip(array).flatten].merge(:id=>count))
50
+ new(Hash[*names.zip(array).flatten].merge(:id=>count))
60
51
  }
61
52
  end
62
53
 
@@ -8,96 +8,219 @@ module DataMapper
8
8
  require 'open-uri'
9
9
  require 'hpricot'
10
10
 
11
- class Scraper
12
- include CachedAccessor
13
-
11
+ module Scraper
14
12
  class TableNotFound < RuntimeError; end
15
13
 
16
- attr_reader :html
14
+ def self.paginate?(model)
15
+ model.uri.to_s[-1] == ?*
16
+ end
17
17
 
18
- def initialize(model)
19
- raise ArgumentError, "missing model" unless model
20
- raise ArgumentError, "missing uri" unless model.uri
21
- @model = model
22
- @html = NKF.nkf('-w', open(model.uri).read)
23
- @invalid_name_count = 0
18
+ def self.lookup(model)
19
+ scraper = paginate?(model) ? Composite : Page
20
+ scraper.new(model)
24
21
  end
25
22
 
26
- def guess_table
27
- [max_table_from("table"), max_table_from("table > tbody")].sort_by(&:first).last.last or
28
- raise TableNotFound, "set 'table' or 'tbody' manually"
23
+ def self.load(model)
24
+ loader = lookup(model)
25
+ loader.register_properties!
26
+ return loader
29
27
  end
30
28
 
31
- cached_accessor do
32
- doc {Hpricot(@html)}
33
- table {specified(:table) or guess_table}
34
- thead {specified(:thead) or table.search("> thead").first or table}
35
- tbody {specified(:tbody) or table.search("> tbody").first or table}
36
- names {labels.map{|i| label2name(i)}}
37
- labels {thead.search("> tr").first.search("> td|th").map{|i|strip_tags(i.inner_html)}}
38
- entries {tbody.search("> tr").map{|tr| tr.search("> td").map{|i|strip_tags(i.inner_html)}}.delete_if{|i|i.blank?}}
29
+ ######################################################################
30
+ ### Utils
31
+
32
+ module Utils
33
+ def constantize(label)
34
+ require 'cgi'
35
+ label = CGI.unescapeHTML(label.to_s)
36
+ label.gsub!(/&[a-z]+;/, '')
37
+ label.gsub!(/\r?\n/, '')
38
+ label.gsub!(/\s+/,'')
39
+ label.delete!('!"#$%&()=~|`{}^-[]/<>:;,.\\-')
40
+ label.delete!("'")
41
+ return label
42
+ end
43
+
44
+ module_function :constantize
39
45
  end
40
46
 
41
- private
47
+ ######################################################################
48
+ ### Base Scraper
42
49
 
43
- def max_table_from(entry)
44
- table = nil
45
- count = -1
46
- doc.search(entry).each do |t|
47
- size = t.search("> tr").size
48
- if size > count
49
- count = size
50
- table = t
51
- end
52
- end
53
- [count, table]
50
+ class Base
51
+ include CachedAccessor
52
+
53
+ def initialize(model, *args)
54
+ raise ArgumentError, "missing model" unless model
55
+ raise ArgumentError, "missing uri" unless model.uri
56
+ @model = model
57
+ end
58
+ [:names, :labels, :entries].each do |method|
59
+ define_method(method) {raise NotImplementedError, method.to_s}
54
60
  end
55
61
 
56
- def specified(name)
57
- @model.respond_to?(name) or raise ArgumentError, "invalid selector name: #{name}"
58
- css = @model.__send__(name) or return nil
59
-
60
- element = doc.search(css)
61
- case element
62
- when Hpricot::Elem
63
- return element
64
- when Hpricot::Elements
65
- return element.first
66
- else
67
- return nil
62
+ def uri
63
+ @uri || @model.uri.to_s.chomp('*')
64
+ end
65
+
66
+ def register_properties!
67
+ names.each do |name|
68
+ type = String # TODO
69
+ @model.property name.intern, type
68
70
  end
69
71
  end
72
+ end
70
73
 
71
- def label2name(label)
72
- require 'cgi'
73
- label = CGI.unescapeHTML(label)
74
- label.gsub!(/&nbsp;/, '')
75
- label.gsub!(/\r?\n/, '')
76
- label.delete!('!"#$%&()=~|`{}^-[]/<>:; \\')
77
- label.delete!("'")
78
- label.strip!
74
+ ######################################################################
75
+ ### Page Scraper
76
+
77
+ class Page < Base
78
+ attr_reader :html
79
+
80
+ def initialize(model, uri = nil)
81
+ super
82
+ @uri = uri
83
+ @html = NKF.nkf('-w', open(self.uri).read)
84
+ @invalid_name_count = 0
85
+ end
86
+
87
+ def guess_table
88
+ max_table or
89
+ raise TableNotFound, "set 'table' or 'tbody' manually"
90
+ end
91
+
92
+ def pagination_links
93
+ base = URI.parse(uri.split('?').first)
94
+ urls = (doc / "a").map{|i| i[:href] =~ /^http/ ? i[:href] : (base+i[:href]).to_s}.uniq
95
+ urls.select{|url| /^#{Regexp.escape(base.to_s)}/ === url}
96
+ end
97
+
98
+ def inspect
99
+ attrs = [
100
+ [ :html, "#{html.size}bytes" ],
101
+ [ :names, names ],
102
+ [ :entries, entries.size ],
103
+ ]
104
+ "#<#{self.class.name} #{attrs.map { |(k,v)| "@#{k}=#{v.inspect}" } * ' '}>"
105
+ end
106
+
107
+ cached_accessor do
108
+ doc {Hpricot(@html)}
109
+ table {specified(:table) or guess_table}
110
+ thead {specified(:thead) or table.search("> thead").first or table}
111
+ tbody {specified(:tbody) or table.search("> tbody").first or table}
112
+ names {labels.map{|i| label2name(i)}}
113
+ labels {thead.search("> tr").first.search("> td|th").map{|i|strip_tags(i.inner_html)}}
114
+ entries {tbody.search("> tr").map{|tr| tr.search("> td").map{|i|strip_tags(i.inner_html)}}.delete_if{|i|i.blank?}}
115
+ count {entries.size}
116
+ end
79
117
 
80
- if /^([A-Z])/ === label and Object.const_defined?(label)
81
- label = "_#{label}"
118
+ private
119
+
120
+ def max_table
121
+ table = nil
122
+ count = -1
123
+ doc.search("table").each do |t|
124
+ size = [t.search("> tr").size, t.search("> tbody > tr").size].max
125
+ if size > count
126
+ count = size
127
+ table = t
128
+ end
129
+ end
130
+ return table
131
+ end
132
+
133
+ def specified(name)
134
+ @model.respond_to?(name) or raise ArgumentError, "invalid selector name: #{name}"
135
+ css = @model.__send__(name) or return nil
136
+
137
+ element = doc.search(css)
138
+ case element
139
+ when Hpricot::Elem
140
+ return element
141
+ when Hpricot::Elements
142
+ return element.first
143
+ else
144
+ return nil
145
+ end
82
146
  end
83
- if label.blank? or @model.respond_to?(label, true)
84
- new_name_for(label)
85
- elsif /^[0-9]/ === label
86
- "_#{label}"
87
- else
88
- label
147
+
148
+ def label2name(label)
149
+ label = Utils.constantize(label)
150
+
151
+ if /^([A-Z])/ === label and Object.const_defined?(label)
152
+ label = "_#{label}"
153
+ end
154
+ if label.blank? or @model.respond_to?(label, true)
155
+ new_name_for(label)
156
+ elsif /^[0-9]/ === label
157
+ "_#{label}"
158
+ else
159
+ label
160
+ end
89
161
  end
162
+
163
+ def new_name_for(label)
164
+ @invalid_name_count += 1
165
+ "col_#{@invalid_name_count}"
166
+ end
167
+
168
+ def strip_tags(html)
169
+ html.gsub(/<.*?>/, '').strip
170
+ end
171
+ end
172
+
173
+ ######################################################################
174
+ ### Composite Scraper
175
+
176
+ class Composite < Base
177
+ def pages
178
+ @pages ||= execute
90
179
  end
91
180
 
92
- def new_name_for(label)
93
- @invalid_name_count += 1
94
- "col_#{@invalid_name_count}"
181
+ def count
182
+ pages.map(&:count).inject(0){|i,v| i+v}
95
183
  end
96
184
 
97
- def strip_tags(html)
98
- html.gsub(/<.*?>/, '').strip
185
+ def names
186
+ pages.first.names
99
187
  end
100
188
 
189
+ def labels
190
+ pages.first.labels
191
+ end
192
+
193
+ def entries
194
+ pages.inject([]){|a,p| a+p.entries}
195
+ end
196
+
197
+ private
198
+ def execute
199
+ visit(uri)
200
+ valid_pages
201
+ end
202
+
203
+ def valid_pages
204
+ loaded_pages.values.compact
205
+ end
206
+
207
+ def loaded_pages
208
+ @loaded_pages ||= {} # url => page object
209
+ end
210
+
211
+ def visit(uri)
212
+ return if loaded_pages[uri]
213
+ page = Page.new(@model, uri)
214
+ base = valid_pages.first
215
+ if !base or base.names == page.names
216
+ loaded_pages[uri] = page
217
+ else
218
+ loaded_pages[uri] = nil
219
+ end
220
+ page.pagination_links.each{|uri| visit(uri)}
221
+ end
222
+ end
223
+
101
224
  end
102
225
  end
103
226
  end
@@ -0,0 +1,26 @@
1
+ require File.join( File.dirname(__FILE__), "spec_helper" )
2
+
3
+ describe DataMapper::YunkerStar do
4
+ it "should provide []" do
5
+ DataMapper::YunkerStar.should respond_to(:[])
6
+ end
7
+
8
+ describe "[]" do
9
+ before(:each) do
10
+ @uri = "http://merbi.st/plugins/index?page=1"
11
+ @ys = DataMapper::YunkerStar[@uri]
12
+ end
13
+
14
+ it "should return a new class" do
15
+ @ys.should be_kind_of(Class)
16
+ end
17
+
18
+ it "should include DataMapper::YunkerStar" do
19
+ @ys.ancestors.should be_include(DataMapper::YunkerStar)
20
+ end
21
+
22
+ it "should set uri" do
23
+ @ys.uri.should == @uri
24
+ end
25
+ end
26
+ end
@@ -0,0 +1,69 @@
1
+ require File.join( File.dirname(__FILE__), "spec_helper" )
2
+
3
+ describe DataMapper::YunkerStar::Scraper::Composite do
4
+ before(:each) do
5
+ @scraper = DataMapper::YunkerStar::Scraper::Composite.new(Plugin)
6
+ end
7
+
8
+ it "should provide #uri" do
9
+ @scraper.should respond_to(:uri)
10
+ end
11
+
12
+ describe "#uri" do
13
+ it "should strip last '*'" do
14
+ @scraper.uri.should == "http://merbi.st/plugins/index?page=1"
15
+ end
16
+ end
17
+
18
+ it "should provide #pages" do
19
+ @scraper.should respond_to(:pages)
20
+ end
21
+
22
+ describe "#pages" do
23
+ it "should return page objects" do
24
+ @scraper.pages.should be_kind_of(Array)
25
+ @scraper.pages.map(&:uri).sort.should ==
26
+ ["http://merbi.st/plugins/index?page=1","http://merbi.st/plugins/index?page=2"]
27
+ end
28
+ end
29
+
30
+ it "should provide #count" do
31
+ @scraper.should respond_to(:count)
32
+ end
33
+
34
+ describe "#count" do
35
+ it "should return sum of page counts" do
36
+ @scraper.count.should == 36
37
+ end
38
+ end
39
+
40
+ it "should provide #names" do
41
+ @scraper.should respond_to(:names)
42
+ end
43
+
44
+ describe "#names" do
45
+ it "should return same value as Plugin" do
46
+ @scraper.names.should == DataMapper::YunkerStar::Scraper::Page.new(Plugin1).names
47
+ end
48
+ end
49
+
50
+ it "should provide #labels" do
51
+ @scraper.should respond_to(:labels)
52
+ end
53
+
54
+ describe "#labels" do
55
+ it "should return same value as Plugin" do
56
+ @scraper.labels.should == DataMapper::YunkerStar::Scraper::Page.new(Plugin1).labels
57
+ end
58
+ end
59
+
60
+ it "should provide #entries" do
61
+ @scraper.should respond_to(:entries)
62
+ end
63
+
64
+ describe "#entries" do
65
+ it "should return same value as Plugin" do
66
+ @scraper.entries.should == (Plugin1.entries + Plugin2.entries)
67
+ end
68
+ end
69
+ end