maiha-dm-ys 0.4 → 0.4.1
Sign up to get free protection for your applications and to get access to all the features.
- data/README +24 -1
- data/Rakefile +1 -1
- data/lib/dm-ys/config.rb +5 -1
- data/lib/dm-ys/element_property.rb +47 -0
- data/lib/dm-ys/scraper.rb +25 -5
- data/spec/attribute_spec.rb +42 -0
- data/spec/config_spec.rb +9 -0
- data/spec/data/th.html +108 -0
- data/spec/models/style.rb +24 -0
- data/spec/proxy_spec.rb +0 -21
- data/spec/scraper_spec.rb +6 -0
- data/spec/spec_helper.rb +2 -0
- data/spec/uniq_record_spec.rb +17 -0
- metadata +7 -2
data/README
CHANGED
@@ -64,8 +64,31 @@ Append "*" to uri if you want pagination mode.
|
|
64
64
|
=> 36
|
65
65
|
|
66
66
|
|
67
|
+
Raw Element
|
68
|
+
===========
|
69
|
+
|
70
|
+
The "element_for" method returns the raw scraped elements, which are currently Hpricot elements.
|
71
|
+
|
72
|
+
>> Plugin.names #=> ["Name", "Repos", "Registeredby", "Description", "col_1"]
|
73
|
+
>> record = Plugin.first #=> #<Plugin id=1 Name="eventmachine-0.12.5" Repos=...>
|
74
|
+
>> record.Name #=> "eventmachine-0.12.5"
|
75
|
+
>> record.element_for("Name") #=> {elem td {elem a href"/plugins/36" "eventmachine-0.12.5" a} td}
|
76
|
+
>> record.element_for("Name").class #=> Hpricot::Elem
|
77
|
+
|
78
|
+
"link_for" is syntactic sugar for extracting the href attribute of the first link in its element.
|
79
|
+
|
80
|
+
>> record.link_for("Name") #=> "http://merbi.st/plugins/36"
|
81
|
+
>> record.link_for("Registeredby") #=> "http://merbi.st/users/1"
|
82
|
+
|
83
|
+
Furthermore, the :only_path option controls whether links are returned as fully qualified URLs (the default) or as paths only.
|
84
|
+
|
85
|
+
>> Plugin.ys[:only_path] = true
|
86
|
+
>> record.link_for("Name") #=> "/plugins/36"
|
87
|
+
>> record.link_for("Registeredby") #=> "/users/1"
|
88
|
+
|
89
|
+
|
67
90
|
TODO
|
68
91
|
====
|
69
|
-
*
|
92
|
+
* Feel free to request what you want! :)
|
70
93
|
|
71
94
|
Copyright (c) 2008 maiha@wota.jp, released under the MIT license
|
data/Rakefile
CHANGED
@@ -33,7 +33,7 @@ AUTHOR = "maiha"
|
|
33
33
|
EMAIL = "maiha@wota.jp"
|
34
34
|
HOMEPAGE = "http://github.com/maiha/dm-ys"
|
35
35
|
SUMMARY = "a DataMapper extension that uses html table as its schema and data powerfully like YunkerStar"
|
36
|
-
GEM_VERSION = "0.4"
|
36
|
+
GEM_VERSION = "0.4.1"
|
37
37
|
|
38
38
|
spec = Gem::Specification.new do |s|
|
39
39
|
# s.rubyforge_project = 'merb'
|
data/lib/dm-ys/config.rb
CHANGED
@@ -2,7 +2,7 @@ module DataMapper
|
|
2
2
|
module YS
|
3
3
|
class Config
|
4
4
|
def self.default
|
5
|
-
{:max_pages=>100, :uniq=>true}
|
5
|
+
{:max_pages=>100, :uniq=>true, :only_path=>false}
|
6
6
|
end
|
7
7
|
|
8
8
|
def initialize(options = nil)
|
@@ -25,6 +25,10 @@ module DataMapper
|
|
25
25
|
def uniq_entry?
|
26
26
|
self[:uniq] == true or self[:uniq] == :entry
|
27
27
|
end
|
28
|
+
|
29
|
+
def only_path?
|
30
|
+
!!self[:only_path]
|
31
|
+
end
|
28
32
|
end
|
29
33
|
end
|
30
34
|
end
|
@@ -0,0 +1,47 @@
|
|
1
|
+
module DataMapper
|
2
|
+
module YS
|
3
|
+
|
4
|
+
# ==== Example
|
5
|
+
#
|
6
|
+
# class Foo
|
7
|
+
# include DataMapper::YS
|
8
|
+
# uri ...
|
9
|
+
#
|
10
|
+
# # <tr><th>name</th>...
|
11
|
+
# # <tr><td><a href="/plugins/36">dm-ys</a></td>...
|
12
|
+
#
|
13
|
+
# foo = Foo.first
|
14
|
+
# foo.link_for(:name) # => "/plugins/36"
|
15
|
+
|
16
|
+
module ElementProperty
|
17
|
+
def link_for(key)
|
18
|
+
links_for(key).first
|
19
|
+
end
|
20
|
+
|
21
|
+
def links_for(key)
|
22
|
+
key = normalized_property_for(key)
|
23
|
+
(@links[key.to_s] || []).map do |url|
|
24
|
+
if self.class.ys.only_path?
|
25
|
+
url
|
26
|
+
else
|
27
|
+
(self.class.proxy.base_uri + url).to_s
|
28
|
+
end
|
29
|
+
end
|
30
|
+
end
|
31
|
+
|
32
|
+
def element_for(key)
|
33
|
+
key = normalized_property_for(key)
|
34
|
+
@elements[key.to_s]
|
35
|
+
end
|
36
|
+
|
37
|
+
def links=(value)
|
38
|
+
@links = value
|
39
|
+
end
|
40
|
+
|
41
|
+
def elements=(value)
|
42
|
+
@elements = value
|
43
|
+
end
|
44
|
+
end
|
45
|
+
|
46
|
+
end
|
47
|
+
end
|
data/lib/dm-ys/scraper.rb
CHANGED
@@ -68,6 +68,10 @@ module DataMapper
|
|
68
68
|
@uri || @model.uri.to_s.chomp('*')
|
69
69
|
end
|
70
70
|
|
71
|
+
def base_uri
|
72
|
+
URI.parse(uri.split('?').first)
|
73
|
+
end
|
74
|
+
|
71
75
|
def register_properties!
|
72
76
|
names.each do |name|
|
73
77
|
type = String # TODO
|
@@ -95,7 +99,7 @@ module DataMapper
|
|
95
99
|
end
|
96
100
|
|
97
101
|
def pagination_links
|
98
|
-
base =
|
102
|
+
base = base_uri
|
99
103
|
urls = (doc / "a").map{|i| i[:href] =~ /^http/ ? i[:href] : (base+i[:href]).to_s}.uniq
|
100
104
|
urls.select{|url| /^#{Regexp.escape(base.to_s)}/ === url}
|
101
105
|
end
|
@@ -122,12 +126,17 @@ module DataMapper
|
|
122
126
|
labels {thead.search("> tr").first.search("> td|th").map{|i|strip_tags(i.inner_html)}}
|
123
127
|
records {
|
124
128
|
tbody.search("> tr").map do |tr|
|
125
|
-
elems
|
126
|
-
|
129
|
+
elems = tr.search("> td")
|
130
|
+
next if elems.blank? # ignored because this should be TH columns
|
131
|
+
|
132
|
+
values = elems.map{|i|strip_tags(i.inner_html)}
|
133
|
+
elements = Hash[*names.zip(elems).flatten]
|
134
|
+
|
127
135
|
record = @model.new(Hash[*names.zip(values).flatten])
|
128
|
-
record.elements =
|
136
|
+
record.elements = elements
|
137
|
+
record.links = names.inject({}){|h,name| h[name] = links_for(elements[name]); h}
|
129
138
|
record
|
130
|
-
end
|
139
|
+
end.compact
|
131
140
|
}
|
132
141
|
end
|
133
142
|
|
@@ -184,6 +193,17 @@ module DataMapper
|
|
184
193
|
def strip_tags(html)
|
185
194
|
html.gsub(/<.*?>/, '').strip
|
186
195
|
end
|
196
|
+
|
197
|
+
def links_for(element)
|
198
|
+
case element
|
199
|
+
when Hpricot::Elem
|
200
|
+
return Array(element.search("a")).map{|i| i[:href]}
|
201
|
+
when Hpricot::Elements
|
202
|
+
return element.map{|e| links_for(e)}.flatten
|
203
|
+
else
|
204
|
+
return []
|
205
|
+
end
|
206
|
+
end
|
187
207
|
end
|
188
208
|
|
189
209
|
######################################################################
|
@@ -0,0 +1,42 @@
|
|
1
|
+
require File.join( File.dirname(__FILE__), "spec_helper" )
|
2
|
+
|
3
|
+
describe DataMapper::YS, "(a record)" do
|
4
|
+
class OnlyPath
|
5
|
+
include DataMapper::YS
|
6
|
+
uri "http://merbi.st/plugins/"
|
7
|
+
ys[:only_path] = true
|
8
|
+
end
|
9
|
+
|
10
|
+
before(:each) do
|
11
|
+
@record = Plugin.first
|
12
|
+
end
|
13
|
+
|
14
|
+
it "should provide #link_for" do
|
15
|
+
@record.should respond_to(:link_for)
|
16
|
+
end
|
17
|
+
|
18
|
+
describe "#link_for" do
|
19
|
+
it "should return first link if its element has href attributes" do
|
20
|
+
@record.link_for("Name").should == "http://merbi.st/plugins/36"
|
21
|
+
end
|
22
|
+
|
23
|
+
it "should return nil if its element has no href attributes" do
|
24
|
+
@record.link_for("Description").should == nil
|
25
|
+
end
|
26
|
+
|
27
|
+
it "should return only path link when :only_path is true" do
|
28
|
+
OnlyPath.first.link_for("Name").should == "/plugins/36"
|
29
|
+
end
|
30
|
+
end
|
31
|
+
|
32
|
+
it "should provide #element_for" do
|
33
|
+
@record.should respond_to(:element_for)
|
34
|
+
end
|
35
|
+
|
36
|
+
describe "#element_for" do
|
37
|
+
it "should return first link if its element has href attributes" do
|
38
|
+
@record.element_for("Name").to_s.should == "<td><a href=\"/plugins/36\">eventmachine-0.12.5</a></td>"
|
39
|
+
end
|
40
|
+
end
|
41
|
+
|
42
|
+
end
|
data/spec/config_spec.rb
CHANGED
@@ -63,6 +63,15 @@ describe DataMapper::YS::Config do
|
|
63
63
|
config = DataMapper::YS::Config.new(:uniq=>false)
|
64
64
|
config.uniq_entry?.should == false
|
65
65
|
end
|
66
|
+
end
|
67
|
+
|
68
|
+
it "should provide #only_path?" do
|
69
|
+
@config.should respond_to(:only_path?)
|
70
|
+
end
|
66
71
|
|
72
|
+
describe "[:only_path]" do
|
73
|
+
it "should has false as default value" do
|
74
|
+
@config[:only_path].should == false
|
75
|
+
end
|
67
76
|
end
|
68
77
|
end
|
data/spec/data/th.html
ADDED
@@ -0,0 +1,108 @@
|
|
1
|
+
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
|
2
|
+
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en-us" lang="en-us">
|
3
|
+
<head>
|
4
|
+
<title>Merbist Plugins</title>
|
5
|
+
<meta http-equiv="content-type" content="text/html; charset=utf-8" />
|
6
|
+
<meta name="verify-v1"
|
7
|
+
content="QqJ1Kmvs51kF+1Sn+7JUqkXTXbnmLVKzFctoGLRDLE8=" />
|
8
|
+
<link rel="stylesheet" href="/stylesheets/master.css" type="text/css"
|
9
|
+
media="screen" charset="utf-8" />
|
10
|
+
<script src="/javascripts/jquery.js" type="text/javascript"></script>
|
11
|
+
<link rel="alternate" type="application/rss+xml" title="Atom" href="/plugins.atom"></link>
|
12
|
+
|
13
|
+
</head>
|
14
|
+
<body>
|
15
|
+
<div id="base">
|
16
|
+
<div id="header">
|
17
|
+
<div id="navi">
|
18
|
+
<a href="/">Top</a>
|
19
|
+
<a href="/plugins">Plugins</a>
|
20
|
+
<a href="/talks">Talks</a>
|
21
|
+
<a href="/sites">Sites</a>
|
22
|
+
<a href="/users">Users</a>
|
23
|
+
</div>
|
24
|
+
<div id="menu">
|
25
|
+
<a href="/login">Login</a>
|
26
|
+
<a href="/users/new">Sing up</a>
|
27
|
+
</div>
|
28
|
+
<div class="clear"><!----></div>
|
29
|
+
|
30
|
+
</div>
|
31
|
+
<div id="body">
|
32
|
+
<div class="command">
|
33
|
+
<a href="/plugins">Index</a>
|
34
|
+
</div>
|
35
|
+
|
36
|
+
|
37
|
+
<h1>Plugin List</h1>
|
38
|
+
|
39
|
+
<table class="plugin-list" cellspacing="1" border="0">
|
40
|
+
<tr>
|
41
|
+
<th>Name</th>
|
42
|
+
<th>Repos</th>
|
43
|
+
<th>Registered by</th>
|
44
|
+
<th>Description</th>
|
45
|
+
<th></th>
|
46
|
+
</tr>
|
47
|
+
<tr class="even">
|
48
|
+
<td><a href="/plugins/36">eventmachine-0.12.5</a></td>
|
49
|
+
<td>†</td>
|
50
|
+
<td><a href="/users/1">genki</a></td>
|
51
|
+
<td>EventMachine</td>
|
52
|
+
<td>
|
53
|
+
</td>
|
54
|
+
</tr>
|
55
|
+
<tr class="">
|
56
|
+
<td><a href="/plugins/35">dm-last-0.0.1</a></td>
|
57
|
+
<td>†</td>
|
58
|
+
<td><a href="/users/1">genki</a></td>
|
59
|
+
<td>Model.last</td>
|
60
|
+
<td>
|
61
|
+
</td>
|
62
|
+
</tr>
|
63
|
+
</table>
|
64
|
+
|
65
|
+
<div class="pagination"><span class="prev disabled">« Prev</span>
|
66
|
+
<span class="current disabled">1</span>
|
67
|
+
<span class="disabled"><a href="/plugins/uniq?page=2">2</a></span>
|
68
|
+
<a class="next" rel="next" href="/plugins/uniq?page=2">Next »</a></div>
|
69
|
+
|
70
|
+
<div class="footnote">
|
71
|
+
† You can install these gems by
|
72
|
+
<code>
|
73
|
+
gem install gem-name --source http://merbi.st
|
74
|
+
</code>
|
75
|
+
</div>
|
76
|
+
|
77
|
+
|
78
|
+
</div>
|
79
|
+
<div id="footer">
|
80
|
+
<div id="footer">
|
81
|
+
2008
|
82
|
+
<a href="http://wota.jp/ac/">maiha</a>,
|
83
|
+
<a href="http://d.jong.gr.jp/shachi">shachi</a>
|
84
|
+
and
|
85
|
+
<a href="http://blog.s21g.com/genki">genki</a>
|
86
|
+
(<a href="http://www.s21g.com/">s21g LLC</a>).
|
87
|
+
<span class="powered-by">
|
88
|
+
Powered by
|
89
|
+
Merb-1.0.9
|
90
|
+
(Ruby-1.9.1)
|
91
|
+
</span>
|
92
|
+
</div>
|
93
|
+
|
94
|
+
</div>
|
95
|
+
</div>
|
96
|
+
|
97
|
+
<script type="text/javascript">
|
98
|
+
var gaJsHost = (("https:" == document.location.protocol) ? "https://ssl." : "http://www.");
|
99
|
+
document.write(unescape("%3Cscript src='" + gaJsHost + "google-analytics.com/ga.js' type='text/javascript'%3E%3C/script%3E"));
|
100
|
+
</script>
|
101
|
+
<script type="text/javascript">
|
102
|
+
try {
|
103
|
+
var pageTracker = _gat._getTracker("UA-2733799-11");
|
104
|
+
pageTracker._trackPageview();
|
105
|
+
} catch(err) {}</script>
|
106
|
+
|
107
|
+
</body>
|
108
|
+
</html>
|
@@ -0,0 +1,24 @@
|
|
1
|
+
class BlankHtml
|
2
|
+
include DataMapper::YS
|
3
|
+
uri spec_data_path("blank.html")
|
4
|
+
end
|
5
|
+
|
6
|
+
class BlankStyle
|
7
|
+
include DataMapper::YS
|
8
|
+
uri spec_data_path("plugins1.html")
|
9
|
+
end
|
10
|
+
|
11
|
+
class TableStyle < BlankStyle
|
12
|
+
uri spec_data_path("plugins1.html")
|
13
|
+
table "table.main"
|
14
|
+
end
|
15
|
+
|
16
|
+
class TheadStyle < BlankStyle
|
17
|
+
uri spec_data_path("plugins1.html")
|
18
|
+
thead "table.main"
|
19
|
+
end
|
20
|
+
|
21
|
+
class ThStyle
|
22
|
+
include DataMapper::YS
|
23
|
+
uri spec_data_path("th.html")
|
24
|
+
end
|
data/spec/proxy_spec.rb
CHANGED
@@ -1,26 +1,6 @@
|
|
1
1
|
require File.join( File.dirname(__FILE__), "spec_helper" )
|
2
2
|
|
3
3
|
describe DataMapper::YS do
|
4
|
-
class ::BlankHtml
|
5
|
-
include DataMapper::YS
|
6
|
-
uri spec_data_path("blank.html")
|
7
|
-
end
|
8
|
-
|
9
|
-
class ::BlankStyle
|
10
|
-
include DataMapper::YS
|
11
|
-
uri spec_data_path("plugins1.html")
|
12
|
-
end
|
13
|
-
|
14
|
-
class ::TableStyle < BlankStyle
|
15
|
-
uri spec_data_path("plugins1.html")
|
16
|
-
table "table.main"
|
17
|
-
end
|
18
|
-
|
19
|
-
class ::TheadStyle < BlankStyle
|
20
|
-
uri spec_data_path("plugins1.html")
|
21
|
-
thead "table.main"
|
22
|
-
end
|
23
|
-
|
24
4
|
######################################################################
|
25
5
|
### Config
|
26
6
|
|
@@ -120,6 +100,5 @@ describe DataMapper::YS do
|
|
120
100
|
["Name", "Repos", "Registered by", "Description", ""]
|
121
101
|
end
|
122
102
|
end
|
123
|
-
|
124
103
|
end
|
125
104
|
end
|
data/spec/scraper_spec.rb
CHANGED
data/spec/spec_helper.rb
CHANGED
@@ -32,6 +32,8 @@ rescue LoadError
|
|
32
32
|
end
|
33
33
|
|
34
34
|
mapping = {
|
35
|
+
"http://merbi.st/plugins/" => spec_data_path("plugins1.html"),
|
36
|
+
|
35
37
|
# plugin (paginated)
|
36
38
|
"http://merbi.st/plugins/index?page=1" => spec_data_path("plugins1.html"),
|
37
39
|
"http://merbi.st/plugins/index?page=2" => spec_data_path("plugins2.html"),
|
@@ -0,0 +1,17 @@
|
|
1
|
+
require File.join( File.dirname(__FILE__), "spec_helper" )
|
2
|
+
|
3
|
+
describe DataMapper::YS::Scraper::Composite do
|
4
|
+
describe "UniqPlugin" do
|
5
|
+
it "should return 2 pages" do
|
6
|
+
UniqPlugin.proxy.pages.size.should == 2
|
7
|
+
end
|
8
|
+
|
9
|
+
describe "#count" do
|
10
|
+
it "should return same value as Plugin" do
|
11
|
+
UniqPlugin1.count.should == 2
|
12
|
+
UniqPlugin2.count.should == 2
|
13
|
+
UniqPlugin .count.should == 3
|
14
|
+
end
|
15
|
+
end
|
16
|
+
end
|
17
|
+
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: maiha-dm-ys
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 0.4.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- maiha
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2009-03-
|
12
|
+
date: 2009-03-08 00:00:00 -08:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
@@ -59,6 +59,7 @@ files:
|
|
59
59
|
- lib/dm-ys
|
60
60
|
- lib/dm-ys/base.rb
|
61
61
|
- lib/dm-ys/config.rb
|
62
|
+
- lib/dm-ys/element_property.rb
|
62
63
|
- lib/dm-ys/indexed_property.rb
|
63
64
|
- lib/dm-ys/memory_repository.rb
|
64
65
|
- lib/dm-ys/scraper.rb
|
@@ -69,15 +70,19 @@ files:
|
|
69
70
|
- spec/data/uniq2.html
|
70
71
|
- spec/data/sorted2.html
|
71
72
|
- spec/data/sorted1.html
|
73
|
+
- spec/data/th.html
|
72
74
|
- spec/data/plugins1.html
|
73
75
|
- spec/data/gem_maintainers.html
|
74
76
|
- spec/data/plugins2.html
|
75
77
|
- spec/data/uniq1.html
|
76
78
|
- spec/data/blank.html
|
77
79
|
- spec/models
|
80
|
+
- spec/models/style.rb
|
78
81
|
- spec/models/gem_maintainer.rb
|
79
82
|
- spec/models/plugin.rb
|
80
83
|
- spec/composite_scraper_spec.rb
|
84
|
+
- spec/uniq_record_spec.rb
|
85
|
+
- spec/attribute_spec.rb
|
81
86
|
- spec/anonymous_spec.rb
|
82
87
|
- spec/scraper_spec.rb
|
83
88
|
- spec/indexed_property_spec.rb
|