maiha-dm-ys 0.3 → 0.3.1

Sign up to get free protection for your applications and to get access to all the features.
data/Rakefile CHANGED
@@ -33,7 +33,7 @@ AUTHOR = "maiha"
33
33
  EMAIL = "maiha@wota.jp"
34
34
  HOMEPAGE = "http://github.com/maiha/dm-ys"
35
35
  SUMMARY = "a DataMapper extension that uses html table as its schema and data powerfully like YunkerStar"
36
- GEM_VERSION = "0.3"
36
+ GEM_VERSION = "0.3.1"
37
37
 
38
38
  spec = Gem::Specification.new do |s|
39
39
  # s.rubyforge_project = 'merb'
@@ -4,6 +4,7 @@ require 'dm-core'
4
4
  require 'dsl_accessor'
5
5
 
6
6
  __DIR__ = File.dirname(__FILE__)
7
+ require __DIR__ + '/dm-ys/config'
7
8
  require __DIR__ + '/dm-ys/base'
8
9
  require __DIR__ + '/dm-ys/cached_accessor'
9
10
  require __DIR__ + '/dm-ys/memory_repository'
@@ -0,0 +1,30 @@
1
module DataMapper
  module YunkerStar
    # Option holder for a scraped model (exposed on the model as +ys+).
    #
    # Recognised keys, as used by the code in this gem:
    #   :max_pages - upper bound checked by the crawler while following
    #                pagination links
    #   :uniq      - true, false, :page or :entry; see #uniq_page? and
    #                #uniq_entry?
    class Config
      # Returns a fresh Hash holding the built-in default options.
      def self.default
        {:max_pages=>100, :uniq=>true}
      end

      # options - optional Hash of overrides. Anything that is not a Hash
      #           (including nil) is ignored and the defaults are used as is.
      #
      # The overrides are merged on top of the defaults, so a partial Hash
      # such as {:uniq=>:page} still carries :max_pages. Storing the caller's
      # Hash verbatim would leave :max_pages nil and break the page-limit
      # comparison in the crawler. The merge also means the caller's Hash is
      # never mutated by #[]=.
      def initialize(options = nil)
        overrides = options.is_a?(Hash) ? options : {}
        @options = self.class.default.merge(overrides)
      end

      # Returns the value stored under key, or nil when the key is absent.
      def [](key)
        @options[key]
      end

      # Stores val under key and returns val.
      def []=(key, val)
        @options[key] = val
      end

      # True when :uniq holds any truthy value (true, :page, :entry, ...).
      def uniq_page?
        !!self[:uniq]
      end

      # True only when :uniq is exactly true or :entry.
      def uniq_entry?
        uniq = self[:uniq]
        uniq == true || uniq == :entry
      end
    end
  end
end
@@ -18,10 +18,13 @@ module DataMapper
18
18
  dsl_accessor :table
19
19
  dsl_accessor :tbody
20
20
  dsl_accessor :thead
21
+ dsl_accessor :ys, :default=>proc{|*a| DataMapper::YunkerStar::Config.new}
21
22
  property :id, DataMapper::Types::Serial
22
23
  end
23
24
  end
24
25
 
26
+ class MaxPagesOverflow < RuntimeError; end
27
+
25
28
  module ClassMethods
26
29
  def proxy
27
30
  @proxy ||= Scraper.load(self)
@@ -7,6 +7,7 @@ module DataMapper
7
7
  require 'nkf'
8
8
  require 'open-uri'
9
9
  require 'hpricot'
10
+ require 'digest/sha1'
10
11
 
11
12
  module Scraper
12
13
  class TableNotFound < RuntimeError; end
@@ -59,6 +60,10 @@ module DataMapper
59
60
  define_method(method) {raise NotImplementedError, method.to_s}
60
61
  end
61
62
 
63
+ def count
64
+ entries.size
65
+ end
66
+
62
67
  def uri
63
68
  @uri || @model.uri.to_s.chomp('*')
64
69
  end
@@ -99,11 +104,16 @@ module DataMapper
99
104
  attrs = [
100
105
  [ :html, "#{html.size}bytes" ],
101
106
  [ :names, names ],
102
- [ :entries, entries.size ],
107
+ [ :entries, count ],
103
108
  ]
104
109
  "#<#{self.class.name} #{attrs.map { |(k,v)| "@#{k}=#{v.inspect}" } * ' '}>"
105
110
  end
106
111
 
112
+ def page_hash
113
+ body = entries.flatten.join("\t")
114
+ Digest::SHA1.hexdigest(body)
115
+ end
116
+
107
117
  cached_accessor do
108
118
  doc {Hpricot(@html)}
109
119
  table {specified(:table) or guess_table}
@@ -112,7 +122,6 @@ module DataMapper
112
122
  names {labels.map{|i| label2name(i)}}
113
123
  labels {thead.search("> tr").first.search("> td|th").map{|i|strip_tags(i.inner_html)}}
114
124
  entries {tbody.search("> tr").map{|tr| tr.search("> td").map{|i|strip_tags(i.inner_html)}}.delete_if{|i|i.blank?}}
115
- count {entries.size}
116
125
  end
117
126
 
118
127
  private
@@ -178,10 +187,6 @@ module DataMapper
178
187
  @pages ||= execute
179
188
  end
180
189
 
181
- def count
182
- pages.map(&:count).inject(0){|i,v| i+v}
183
- end
184
-
185
190
  def names
186
191
  pages.first.names
187
192
  end
@@ -191,33 +196,66 @@ module DataMapper
191
196
  end
192
197
 
193
198
  def entries
194
- pages.inject([]){|a,p| a+p.entries}
199
+ records = []
200
+ digests = Set.new
201
+ pages.each do |page|
202
+ page.entries.each do |entry|
203
+ if config.uniq_entry?
204
+ sha1 = Digest::SHA1.hexdigest(entry.join("\t"))
205
+ next if digests.include?(sha1)
206
+ digests << sha1
207
+ end
208
+ records << entry
209
+ end
210
+ end
211
+ return records
195
212
  end
196
213
 
197
214
  private
198
215
  def execute
199
216
  visit(uri)
200
- valid_pages
217
+ uniq_pages
218
+ end
219
+
220
+ def config
221
+ @model.ys
201
222
  end
202
223
 
203
- def valid_pages
204
- loaded_pages.values.compact
224
+ def uniq_pages
225
+ return loaded_pages unless config.uniq_page?
226
+
227
+ digests = Set.new
228
+ loaded_pages.select do |page|
229
+ sha1 = page.page_hash
230
+ if digests.include?(sha1)
231
+ false
232
+ else
233
+ digests << sha1
234
+ true
235
+ end
236
+ end
205
237
  end
206
238
 
207
239
  def loaded_pages
208
- @loaded_pages ||= {} # url => page object
240
+ loaded_pages_hash.values.compact
241
+ end
242
+
243
+ def loaded_pages_hash
244
+ @loaded_pages_hash ||= {} # url => page object
209
245
  end
210
246
 
211
- def visit(uri)
212
- return if loaded_pages[uri]
247
+ def visit(uri, runtime_options = {:count => 0})
248
+ return if loaded_pages_hash[uri]
249
+ raise Proxy::MaxPagesOverflow if (runtime_options[:count]+=1) > @model.ys[:max_pages]
250
+
213
251
  page = Page.new(@model, uri)
214
- base = valid_pages.first
252
+ base = loaded_pages.first
215
253
  if !base or base.names == page.names
216
- loaded_pages[uri] = page
254
+ loaded_pages_hash[uri] = page
217
255
  else
218
- loaded_pages[uri] = nil
256
+ loaded_pages_hash[uri] = nil
219
257
  end
220
- page.pagination_links.each{|uri| visit(uri)}
258
+ page.pagination_links.each{|uri| visit(uri, runtime_options)}
221
259
  end
222
260
  end
223
261
 
@@ -66,4 +66,56 @@ describe DataMapper::YunkerStar::Scraper::Composite do
66
66
  @scraper.entries.should == (Plugin1.entries + Plugin2.entries)
67
67
  end
68
68
  end
69
+
70
+ describe "SortedPlugin" do
71
+ it "should return 6 pages" do
72
+ SortedPlugin.proxy.pages.size.should == 6
73
+ end
74
+
75
+ describe "#entries" do
76
+ it "should return duplicate entries" do
77
+ SortedPlugin.entries.sort.should == ((SortedPlugin1.entries + SortedPlugin2.entries)*3).sort
78
+ end
79
+ end
80
+
81
+ describe "#count" do
82
+ it "should return duplicate entries" do
83
+ SortedPlugin.count.should == (SortedPlugin1.count + SortedPlugin2.count)*3
84
+ end
85
+ end
86
+ end
87
+
88
+ describe "SortedPlugin with uniq page option" do
89
+ it "should return 2 pages" do
90
+ SortedPluginWithUniqPage.proxy.pages.size.should == 2
91
+ end
92
+
93
+ describe "#entries" do
94
+ it "should return same value as Plugin" do
95
+ SortedPluginWithUniqPage.entries.should == (SortedPlugin1.entries + SortedPlugin2.entries)
96
+ end
97
+ end
98
+
99
+ describe "#count" do
100
+ it "should return same value as Plugin" do
101
+ SortedPluginWithUniqPage.count.should == (SortedPlugin1.count + SortedPlugin2.count)
102
+ end
103
+ end
104
+ end
105
+
106
+ describe "UniqPlugin" do
107
+ it "should return 2 pages" do
108
+ UniqPlugin.proxy.pages.size.should == 2
109
+ end
110
+
111
+ describe "#count" do
112
+ it "should return same value as Plugin" do
113
+ UniqPlugin1.count.should == 20
114
+ UniqPlugin2.count.should == 4
115
+ UniqPlugin .count.should == 22
116
+ end
117
+ end
118
+ end
119
+
120
+
69
121
  end
@@ -0,0 +1,68 @@
1
+ require File.join( File.dirname(__FILE__), "spec_helper" )
2
+
3
# Behaviour of DataMapper::YunkerStar::Config: default values and how the
# :uniq option maps onto the #uniq_page? / #uniq_entry? predicates.
describe DataMapper::YunkerStar::Config do
  before(:each) do
    @config = DataMapper::YunkerStar::Config.new
  end

  it "should provide []" do
    @config.should respond_to(:[])
  end

  describe "[:uniq]" do
    it "should have true as default value" do
      @config[:uniq].should == true
    end
  end

  it "should provide #uniq_page?" do
    @config.should respond_to(:uniq_page?)
  end

  describe "#uniq_page?" do
    it "should return true as default value" do
      @config.uniq_page?.should == true
    end

    it "should return true when :uniq is set to :page" do
      config = DataMapper::YunkerStar::Config.new(:uniq=>:page)
      config.uniq_page?.should == true
    end

    it "should return true when :uniq is set to :entry" do
      config = DataMapper::YunkerStar::Config.new(:uniq=>:entry)
      config.uniq_page?.should == true
    end

    it "should return false when :uniq is set to false" do
      config = DataMapper::YunkerStar::Config.new(:uniq=>false)
      config.uniq_page?.should == false
    end
  end

  it "should provide #uniq_entry?" do
    @config.should respond_to(:uniq_entry?)
  end

  describe "#uniq_entry?" do
    it "should return true as default value" do
      @config.uniq_entry?.should == true
    end

    it "should return false when :uniq is set to :page" do
      config = DataMapper::YunkerStar::Config.new(:uniq=>:page)
      config.uniq_entry?.should == false
    end

    it "should return true when :uniq is set to :entry" do
      config = DataMapper::YunkerStar::Config.new(:uniq=>:entry)
      config.uniq_entry?.should == true
    end

    # This example exercises :uniq=>false; its title previously repeated
    # the ":page" wording of the example above.
    it "should return false when :uniq is set to false" do
      config = DataMapper::YunkerStar::Config.new(:uniq=>false)
      config.uniq_entry?.should == false
    end

  end
end
@@ -0,0 +1,262 @@
1
+ <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
2
+ <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en-us" lang="en-us">
3
+ <head>
4
+ <title>Merbist Plugins</title>
5
+ <meta http-equiv="content-type" content="text/html; charset=utf-8" />
6
+ <meta name="verify-v1"
7
+ content="QqJ1Kmvs51kF+1Sn+7JUqkXTXbnmLVKzFctoGLRDLE8=" />
8
+ <link rel="stylesheet" href="/stylesheets/master.css" type="text/css"
9
+ media="screen" charset="utf-8" />
10
+ <script src="/javascripts/jquery.js" type="text/javascript"></script>
11
+ <link rel="alternate" type="application/rss+xml" title="Atom" href="/plugins.atom"></link>
12
+
13
+ </head>
14
+ <body>
15
+ <div id="base">
16
+ <div id="header">
17
+ <div id="navi">
18
+ <a href="/">Top</a>
19
+ <a href="/plugins">Plugins</a>
20
+ <a href="/talks">Talks</a>
21
+ <a href="/sites">Sites</a>
22
+ <a href="/users">Users</a>
23
+ </div>
24
+ <div id="menu">
25
+ <a href="/login">Login</a>
26
+ <a href="/users/new">Sing up</a>
27
+ </div>
28
+ <div class="clear"><!----></div>
29
+
30
+ </div>
31
+ <div id="body">
32
+ <div class="command">
33
+ <a href="/plugins">Index</a>
34
+ </div>
35
+
36
+
37
+ <h1>Plugin List</h1>
38
+
39
+ <table class="plugin-list" cellspacing="1" border="0">
40
+ <thead>
41
+ <tr>
42
+ <th>
43
+ <a href="/plugins/sorted?page=1&sort=name1">▲</a>
44
+ Name
45
+ <a href="/plugins/sorted?page=1&sort=name2">▼</a
46
+ </th>
47
+ <th>Repos</th>
48
+ <th>Registered by</th>
49
+ <th>Description</th>
50
+ <th></th>
51
+ </tr>
52
+ </thead>
53
+ <tbody>
54
+ <tr class="even">
55
+ <td><a href="/plugins/36">eventmachine-0.12.5</a></td>
56
+ <td>&dagger;</td>
57
+ <td><a href="/users/1">genki</a></td>
58
+ <td>EventMachine: fast, simple event-processing library for Ruby programs</td>
59
+ <td>
60
+ </td>
61
+ </tr>
62
+ <tr class="">
63
+ <td><a href="/plugins/35">dm-last-0.0.1</a></td>
64
+ <td>&dagger;</td>
65
+ <td><a href="/users/1">genki</a></td>
66
+ <td>A DataMapper plugin that offers a short-hand for Model.all.last as Model.last</td>
67
+ <td>
68
+ </td>
69
+ </tr>
70
+ <tr class="even">
71
+ <td><a href="/plugins/34">sweet_merb_fixtures-0.0.4</a></td>
72
+ <td>&dagger;</td>
73
+ <td><a href="/users/1">genki</a></td>
74
+ <td>The way to store records from YAML file.</td>
75
+ <td>
76
+ </td>
77
+ </tr>
78
+ <tr class="">
79
+ <td><a href="/plugins/33">merb-slices-search-path-fix-0.0.1</a></td>
80
+ <td>&dagger;</td>
81
+ <td><a href="/users/1">genki</a></td>
82
+ <td>A merb plugin that fixes </td>
83
+ <td>
84
+ </td>
85
+ </tr>
86
+ <tr class="even">
87
+ <td><a href="/plugins/32">rubyforge-1.0.2</a></td>
88
+ <td>&dagger;</td>
89
+ <td><a href="/users/1">genki</a></td>
90
+ <td>A script which automates a limited set of rubyforge operations.
91
+ </td>
92
+ <td>
93
+ </td>
94
+ </tr>
95
+ <tr class="">
96
+ <td><a href="/plugins/31">ruby-git</a></td>
97
+ <td>&dagger;</td>
98
+ <td><a href="/users/1">genki</a></td>
99
+ <td>Ruby/Git is a Ruby library that can be used to create, read and manipulate Git repositories by wrapping system calls to the git binary.</td>
100
+ <td>
101
+ </td>
102
+ </tr>
103
+ <tr class="even">
104
+ <td><a href="/plugins/30">fastthread-1.0.1</a></td>
105
+ <td>&dagger;</td>
106
+ <td><a href="/users/1">genki</a></td>
107
+ <td>fastthread that is compatible to ruby191</td>
108
+ <td>
109
+ </td>
110
+ </tr>
111
+ <tr class="">
112
+ <td><a href="/plugins/29">mime-types-1.15.1</a></td>
113
+ <td>&dagger;</td>
114
+ <td><a href="/users/1">genki</a></td>
115
+ <td>This library allows for the identification of a file’s likely MIME content type.</td>
116
+ <td>
117
+ </td>
118
+ </tr>
119
+ <tr class="even">
120
+ <td><a href="/plugins/28">memcached-0.14</a></td>
121
+ <td>&dagger;</td>
122
+ <td><a href="/users/1">genki</a></td>
123
+ <td>A Ruby interface to the libmemcached C client</td>
124
+ <td>
125
+ </td>
126
+ </tr>
127
+ <tr class="">
128
+ <td><a href="/plugins/27">gem_plugin-0.2.3</a></td>
129
+ <td>&dagger;</td>
130
+ <td><a href="/users/1">genki</a></td>
131
+ <td>Gem Based Plugin System</td>
132
+ <td>
133
+ </td>
134
+ </tr>
135
+ <tr class="even">
136
+ <td><a href="/plugins/26">cgi_multipart_eof_fix-2.5.0</a></td>
137
+ <td>&dagger;</td>
138
+ <td><a href="/users/1">genki</a></td>
139
+ <td>cgi_multipart_eof_fix on github</td>
140
+ <td>
141
+ </td>
142
+ </tr>
143
+ <tr class="">
144
+ <td><a href="/plugins/25">mongrel-1.1.2</a></td>
145
+ <td>&dagger;</td>
146
+ <td><a href="/users/1">genki</a></td>
147
+ <td>Mongrel</td>
148
+ <td>
149
+ </td>
150
+ </tr>
151
+ <tr class="even">
152
+ <td><a href="/plugins/24">hpricot-0.6.207</a></td>
153
+ <td>&dagger;</td>
154
+ <td><a href="/users/1">genki</a></td>
155
+ <td>A swift, liberal HTML parser with a fantastic library</td>
156
+ <td>
157
+ </td>
158
+ </tr>
159
+ <tr class="">
160
+ <td><a href="/plugins/23">methopara-0.3.0</a></td>
161
+ <td>&dagger;</td>
162
+ <td><a href="/users/1">genki</a></td>
163
+ <td>Method#parameters for ruby-1.9.1</td>
164
+ <td>
165
+ </td>
166
+ </tr>
167
+ <tr class="even">
168
+ <td><a href="/plugins/22">irb_rocket-0.1.3</a></td>
169
+ <td>&dagger;</td>
170
+ <td><a href="/users/1">genki</a></td>
171
+ <td>irb plugin that makes irb #=&gt; rocket</td>
172
+ <td>
173
+ </td>
174
+ </tr>
175
+ <tr class="">
176
+ <td><a href="/plugins/21">ruby-terminfo-0.1.1</a></td>
177
+ <td>&dagger;</td>
178
+ <td><a href="/users/1">genki</a></td>
179
+ <td>ruby-terminfo is a terminfo binding for Ruby</td>
180
+ <td>
181
+ </td>
182
+ </tr>
183
+ <tr class="even">
184
+ <td><a href="/plugins/20">extlib-present-0.0.2</a></td>
185
+ <td>&dagger;</td>
186
+ <td><a href="/users/1">genki</a></td>
187
+ <td>A plugin that provides Object#present?</td>
188
+ <td>
189
+ </td>
190
+ </tr>
191
+ <tr class="">
192
+ <td><a href="/plugins/19">merb_render_filter-0.0.2</a></td>
193
+ <td>&dagger;</td>
194
+ <td><a href="/users/1">genki</a></td>
195
+ <td>A plugin that provides {before|after}_render filters for Controller.</td>
196
+ <td>
197
+ </td>
198
+ </tr>
199
+ <tr class="even">
200
+ <td><a href="/plugins/18">bcrypt-ruby-2.0.3</a></td>
201
+ <td>&dagger;</td>
202
+ <td><a href="/users/1">genki</a></td>
203
+ <td>bcrypt-ruby is a Ruby binding for the OpenBSD bcrypt() password hashing algorithm, allowing you to easily store a secure hash of your users' passwords.
204
+ </td>
205
+ <td>
206
+ </td>
207
+ </tr>
208
+ <tr class="">
209
+ <td><a href="/plugins/17">json-1.1.4.1</a></td>
210
+ <td>&dagger;</td>
211
+ <td><a href="/users/1">genki</a></td>
212
+ <td>This library can parse JSON texts and generate them from ruby data structures. Install the extension variant (in C) with &quot;gem install json&quot; or install the pure Ruby variant with &quot;gem install json_pure&quot;.</td>
213
+ <td>
214
+ </td>
215
+ </tr>
216
+ </tbody>
217
+ </table>
218
+
219
+ <div class="pagination"><span class="prev disabled">&laquo; Prev</span>
220
+ <span class="current disabled">1</span>
221
+ <span class="disabled"><a href="/plugins/sorted?page=2">2</a></span>
222
+ <a class="next" rel="next" href="/plugins/sorted?page=2">Next &raquo;</a></div>
223
+
224
+ <div class="footnote">
225
+ &dagger; You can install these gems by
226
+ <code>
227
+ gem install gem-name --source http://merbi.st
228
+ </code>
229
+ </div>
230
+
231
+
232
+ </div>
233
+ <div id="footer">
234
+ <div id="footer">
235
+ 2008
236
+ <a href="http://wota.jp/ac/">maiha</a>,
237
+ <a href="http://d.jong.gr.jp/shachi">shachi</a>
238
+ and
239
+ <a href="http://blog.s21g.com/genki">genki</a>
240
+ (<a href="http://www.s21g.com/">s21g LLC</a>).
241
+ <span class="powered-by">
242
+ Powered by
243
+ Merb-1.0.9
244
+ (Ruby-1.9.1)
245
+ </span>
246
+ </div>
247
+
248
+ </div>
249
+ </div>
250
+
251
+ <script type="text/javascript">
252
+ var gaJsHost = (("https:" == document.location.protocol) ? "https://ssl." : "http://www.");
253
+ document.write(unescape("%3Cscript src='" + gaJsHost + "google-analytics.com/ga.js' type='text/javascript'%3E%3C/script%3E"));
254
+ </script>
255
+ <script type="text/javascript">
256
+ try {
257
+ var pageTracker = _gat._getTracker("UA-2733799-11");
258
+ pageTracker._trackPageview();
259
+ } catch(err) {}</script>
260
+
261
+ </body>
262
+ </html>