maiha-dm-ys 0.4 → 0.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README +24 -1
- data/Rakefile +1 -1
- data/lib/dm-ys/config.rb +5 -1
- data/lib/dm-ys/element_property.rb +47 -0
- data/lib/dm-ys/scraper.rb +25 -5
- data/spec/attribute_spec.rb +42 -0
- data/spec/config_spec.rb +9 -0
- data/spec/data/th.html +108 -0
- data/spec/models/style.rb +24 -0
- data/spec/proxy_spec.rb +0 -21
- data/spec/scraper_spec.rb +6 -0
- data/spec/spec_helper.rb +2 -0
- data/spec/uniq_record_spec.rb +17 -0
- metadata +7 -2
data/README
CHANGED
@@ -64,8 +64,31 @@ Append "*" to uri if you want pagination mode.
|
|
64
64
|
=> 36
|
65
65
|
|
66
66
|
|
67
|
+
Raw Element
|
68
|
+
===========
|
69
|
+
|
70
|
+
The "element_for" method returns the raw scraped elements, which are currently Hpricot elements.
|
71
|
+
|
72
|
+
>> Plugin.names #=> ["Name", "Repos", "Registeredby", "Description", "col_1"]
|
73
|
+
>> record = Plugin.first #=> #<Plugin id=1 Name="eventmachine-0.12.5" Repos=...>
|
74
|
+
>> record.Name #=> "eventmachine-0.12.5"
|
75
|
+
>> record.element_for("Name") #=> {elem td {elem a href"/plugins/36" "eventmachine-0.12.5" a} td}
|
76
|
+
>> record.element_for("Name").class #=> Hpricot::Elem
|
77
|
+
|
78
|
+
"link_for" is syntactic sugar for extracting the href attribute of the links inside its element.
|
79
|
+
|
80
|
+
>> record.link_for("Name") #=> "http://merbi.st/plugins/36"
|
81
|
+
>> record.link_for("Registeredby") #=> "http://merbi.st/users/1"
|
82
|
+
|
83
|
+
Furthermore, the :only_path option controls whether "link_for" returns a fully qualified URL or only the path.
|
84
|
+
|
85
|
+
>> Plugin.ys[:only_path] = true
|
86
|
+
>> record.link_for("Name") #=> "/plugins/36"
|
87
|
+
>> record.link_for("Registeredby") #=> "/users/1"
|
88
|
+
|
89
|
+
|
67
90
|
TODO
|
68
91
|
====
|
69
|
-
*
|
92
|
+
* Feel free to request what you want! :)
|
70
93
|
|
71
94
|
Copyright (c) 2008 maiha@wota.jp, released under the MIT license
|
data/Rakefile
CHANGED
@@ -33,7 +33,7 @@ AUTHOR = "maiha"
|
|
33
33
|
EMAIL = "maiha@wota.jp"
|
34
34
|
HOMEPAGE = "http://github.com/maiha/dm-ys"
|
35
35
|
SUMMARY = "a DataMapper extension that uses html table as its schema and data powerfully like YunkerStar"
|
36
|
-
GEM_VERSION = "0.4"
|
36
|
+
GEM_VERSION = "0.4.1"
|
37
37
|
|
38
38
|
spec = Gem::Specification.new do |s|
|
39
39
|
# s.rubyforge_project = 'merb'
|
data/lib/dm-ys/config.rb
CHANGED
@@ -2,7 +2,7 @@ module DataMapper
|
|
2
2
|
module YS
|
3
3
|
class Config
|
4
4
|
def self.default
|
5
|
-
{:max_pages=>100, :uniq=>true}
|
5
|
+
{:max_pages=>100, :uniq=>true, :only_path=>false}
|
6
6
|
end
|
7
7
|
|
8
8
|
def initialize(options = nil)
|
@@ -25,6 +25,10 @@ module DataMapper
|
|
25
25
|
def uniq_entry?
|
26
26
|
self[:uniq] == true or self[:uniq] == :entry
|
27
27
|
end
|
28
|
+
|
29
|
+
def only_path?
|
30
|
+
!!self[:only_path]
|
31
|
+
end
|
28
32
|
end
|
29
33
|
end
|
30
34
|
end
|
@@ -0,0 +1,47 @@
|
|
1
|
+
module DataMapper
|
2
|
+
module YS
|
3
|
+
|
4
|
+
# ==== Example
|
5
|
+
#
|
6
|
+
# Class Foo
|
7
|
+
# include DataMapper::YS
|
8
|
+
# uri ...
|
9
|
+
#
|
10
|
+
# # <tr><th>name</th>...
|
11
|
+
# # <tr><td><a href="/plugins/36">dm-ys</a></td>...
|
12
|
+
#
|
13
|
+
# foo = Foo.first
|
14
|
+
# foo.link_for(:name) # => "/plugins/36"
|
15
|
+
|
16
|
+
module ElementProperty
|
17
|
+
def link_for(key)
|
18
|
+
links_for(key).first
|
19
|
+
end
|
20
|
+
|
21
|
+
def links_for(key)
|
22
|
+
key = normalized_property_for(key)
|
23
|
+
(@links[key.to_s] || []).map do |url|
|
24
|
+
if self.class.ys.only_path?
|
25
|
+
url
|
26
|
+
else
|
27
|
+
(self.class.proxy.base_uri + url).to_s
|
28
|
+
end
|
29
|
+
end
|
30
|
+
end
|
31
|
+
|
32
|
+
def element_for(key)
|
33
|
+
key = normalized_property_for(key)
|
34
|
+
@elements[key.to_s]
|
35
|
+
end
|
36
|
+
|
37
|
+
def links=(value)
|
38
|
+
@links = value
|
39
|
+
end
|
40
|
+
|
41
|
+
def elements=(value)
|
42
|
+
@elements = value
|
43
|
+
end
|
44
|
+
end
|
45
|
+
|
46
|
+
end
|
47
|
+
end
|
data/lib/dm-ys/scraper.rb
CHANGED
@@ -68,6 +68,10 @@ module DataMapper
|
|
68
68
|
@uri || @model.uri.to_s.chomp('*')
|
69
69
|
end
|
70
70
|
|
71
|
+
def base_uri
|
72
|
+
URI.parse(uri.split('?').first)
|
73
|
+
end
|
74
|
+
|
71
75
|
def register_properties!
|
72
76
|
names.each do |name|
|
73
77
|
type = String # TODO
|
@@ -95,7 +99,7 @@ module DataMapper
|
|
95
99
|
end
|
96
100
|
|
97
101
|
def pagination_links
|
98
|
-
base =
|
102
|
+
base = base_uri
|
99
103
|
urls = (doc / "a").map{|i| i[:href] =~ /^http/ ? i[:href] : (base+i[:href]).to_s}.uniq
|
100
104
|
urls.select{|url| /^#{Regexp.escape(base.to_s)}/ === url}
|
101
105
|
end
|
@@ -122,12 +126,17 @@ module DataMapper
|
|
122
126
|
labels {thead.search("> tr").first.search("> td|th").map{|i|strip_tags(i.inner_html)}}
|
123
127
|
records {
|
124
128
|
tbody.search("> tr").map do |tr|
|
125
|
-
elems
|
126
|
-
|
129
|
+
elems = tr.search("> td")
|
130
|
+
next if elems.blank? # ignored because this should be TH columns
|
131
|
+
|
132
|
+
values = elems.map{|i|strip_tags(i.inner_html)}
|
133
|
+
elements = Hash[*names.zip(elems).flatten]
|
134
|
+
|
127
135
|
record = @model.new(Hash[*names.zip(values).flatten])
|
128
|
-
record.elements =
|
136
|
+
record.elements = elements
|
137
|
+
record.links = names.inject({}){|h,name| h[name] = links_for(elements[name]); h}
|
129
138
|
record
|
130
|
-
end
|
139
|
+
end.compact
|
131
140
|
}
|
132
141
|
end
|
133
142
|
|
@@ -184,6 +193,17 @@ module DataMapper
|
|
184
193
|
def strip_tags(html)
|
185
194
|
html.gsub(/<.*?>/, '').strip
|
186
195
|
end
|
196
|
+
|
197
|
+
def links_for(element)
|
198
|
+
case element
|
199
|
+
when Hpricot::Elem
|
200
|
+
return Array(element.search("a")).map{|i| i[:href]}
|
201
|
+
when Hpricot::Elements
|
202
|
+
return element.map{|e| links_for(e)}.flatten
|
203
|
+
else
|
204
|
+
return []
|
205
|
+
end
|
206
|
+
end
|
187
207
|
end
|
188
208
|
|
189
209
|
######################################################################
|
@@ -0,0 +1,42 @@
|
|
1
|
+
require File.join( File.dirname(__FILE__), "spec_helper" )
|
2
|
+
|
3
|
+
describe DataMapper::YS, "(a record)" do
|
4
|
+
class OnlyPath
|
5
|
+
include DataMapper::YS
|
6
|
+
uri "http://merbi.st/plugins/"
|
7
|
+
ys[:only_path] = true
|
8
|
+
end
|
9
|
+
|
10
|
+
before(:each) do
|
11
|
+
@record = Plugin.first
|
12
|
+
end
|
13
|
+
|
14
|
+
it "should provide #link_for" do
|
15
|
+
@record.should respond_to(:link_for)
|
16
|
+
end
|
17
|
+
|
18
|
+
describe "#link_for" do
|
19
|
+
it "should return first link if its element has href attributes" do
|
20
|
+
@record.link_for("Name").should == "http://merbi.st/plugins/36"
|
21
|
+
end
|
22
|
+
|
23
|
+
it "should return nil if its element has no href attributes" do
|
24
|
+
@record.link_for("Description").should == nil
|
25
|
+
end
|
26
|
+
|
27
|
+
it "should return only path link when :only_path is true" do
|
28
|
+
OnlyPath.first.link_for("Name").should == "/plugins/36"
|
29
|
+
end
|
30
|
+
end
|
31
|
+
|
32
|
+
it "should provide #element_for" do
|
33
|
+
@record.should respond_to(:element_for)
|
34
|
+
end
|
35
|
+
|
36
|
+
describe "#element_for" do
|
37
|
+
it "should return first link if its element has href attributes" do
|
38
|
+
@record.element_for("Name").to_s.should == "<td><a href=\"/plugins/36\">eventmachine-0.12.5</a></td>"
|
39
|
+
end
|
40
|
+
end
|
41
|
+
|
42
|
+
end
|
data/spec/config_spec.rb
CHANGED
@@ -63,6 +63,15 @@ describe DataMapper::YS::Config do
|
|
63
63
|
config = DataMapper::YS::Config.new(:uniq=>false)
|
64
64
|
config.uniq_entry?.should == false
|
65
65
|
end
|
66
|
+
end
|
67
|
+
|
68
|
+
it "should provide #only_path?" do
|
69
|
+
@config.should respond_to(:only_path?)
|
70
|
+
end
|
66
71
|
|
72
|
+
describe "[:only_path]" do
|
73
|
+
it "should has false as default value" do
|
74
|
+
@config[:only_path].should == false
|
75
|
+
end
|
67
76
|
end
|
68
77
|
end
|
data/spec/data/th.html
ADDED
@@ -0,0 +1,108 @@
|
|
1
|
+
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
|
2
|
+
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en-us" lang="en-us">
|
3
|
+
<head>
|
4
|
+
<title>Merbist Plugins</title>
|
5
|
+
<meta http-equiv="content-type" content="text/html; charset=utf-8" />
|
6
|
+
<meta name="verify-v1"
|
7
|
+
content="QqJ1Kmvs51kF+1Sn+7JUqkXTXbnmLVKzFctoGLRDLE8=" />
|
8
|
+
<link rel="stylesheet" href="/stylesheets/master.css" type="text/css"
|
9
|
+
media="screen" charset="utf-8" />
|
10
|
+
<script src="/javascripts/jquery.js" type="text/javascript"></script>
|
11
|
+
<link rel="alternate" type="application/rss+xml" title="Atom" href="/plugins.atom"></link>
|
12
|
+
|
13
|
+
</head>
|
14
|
+
<body>
|
15
|
+
<div id="base">
|
16
|
+
<div id="header">
|
17
|
+
<div id="navi">
|
18
|
+
<a href="/">Top</a>
|
19
|
+
<a href="/plugins">Plugins</a>
|
20
|
+
<a href="/talks">Talks</a>
|
21
|
+
<a href="/sites">Sites</a>
|
22
|
+
<a href="/users">Users</a>
|
23
|
+
</div>
|
24
|
+
<div id="menu">
|
25
|
+
<a href="/login">Login</a>
|
26
|
+
<a href="/users/new">Sing up</a>
|
27
|
+
</div>
|
28
|
+
<div class="clear"><!----></div>
|
29
|
+
|
30
|
+
</div>
|
31
|
+
<div id="body">
|
32
|
+
<div class="command">
|
33
|
+
<a href="/plugins">Index</a>
|
34
|
+
</div>
|
35
|
+
|
36
|
+
|
37
|
+
<h1>Plugin List</h1>
|
38
|
+
|
39
|
+
<table class="plugin-list" cellspacing="1" border="0">
|
40
|
+
<tr>
|
41
|
+
<th>Name</th>
|
42
|
+
<th>Repos</th>
|
43
|
+
<th>Registered by</th>
|
44
|
+
<th>Description</th>
|
45
|
+
<th></th>
|
46
|
+
</tr>
|
47
|
+
<tr class="even">
|
48
|
+
<td><a href="/plugins/36">eventmachine-0.12.5</a></td>
|
49
|
+
<td>†</td>
|
50
|
+
<td><a href="/users/1">genki</a></td>
|
51
|
+
<td>EventMachine</td>
|
52
|
+
<td>
|
53
|
+
</td>
|
54
|
+
</tr>
|
55
|
+
<tr class="">
|
56
|
+
<td><a href="/plugins/35">dm-last-0.0.1</a></td>
|
57
|
+
<td>†</td>
|
58
|
+
<td><a href="/users/1">genki</a></td>
|
59
|
+
<td>Model.last</td>
|
60
|
+
<td>
|
61
|
+
</td>
|
62
|
+
</tr>
|
63
|
+
</table>
|
64
|
+
|
65
|
+
<div class="pagination"><span class="prev disabled">« Prev</span>
|
66
|
+
<span class="current disabled">1</span>
|
67
|
+
<span class="disabled"><a href="/plugins/uniq?page=2">2</a></span>
|
68
|
+
<a class="next" rel="next" href="/plugins/uniq?page=2">Next »</a></div>
|
69
|
+
|
70
|
+
<div class="footnote">
|
71
|
+
† You can install these gems by
|
72
|
+
<code>
|
73
|
+
gem install gem-name --source http://merbi.st
|
74
|
+
</code>
|
75
|
+
</div>
|
76
|
+
|
77
|
+
|
78
|
+
</div>
|
79
|
+
<div id="footer">
|
80
|
+
<div id="footer">
|
81
|
+
2008
|
82
|
+
<a href="http://wota.jp/ac/">maiha</a>,
|
83
|
+
<a href="http://d.jong.gr.jp/shachi">shachi</a>
|
84
|
+
and
|
85
|
+
<a href="http://blog.s21g.com/genki">genki</a>
|
86
|
+
(<a href="http://www.s21g.com/">s21g LLC</a>).
|
87
|
+
<span class="powered-by">
|
88
|
+
Powered by
|
89
|
+
Merb-1.0.9
|
90
|
+
(Ruby-1.9.1)
|
91
|
+
</span>
|
92
|
+
</div>
|
93
|
+
|
94
|
+
</div>
|
95
|
+
</div>
|
96
|
+
|
97
|
+
<script type="text/javascript">
|
98
|
+
var gaJsHost = (("https:" == document.location.protocol) ? "https://ssl." : "http://www.");
|
99
|
+
document.write(unescape("%3Cscript src='" + gaJsHost + "google-analytics.com/ga.js' type='text/javascript'%3E%3C/script%3E"));
|
100
|
+
</script>
|
101
|
+
<script type="text/javascript">
|
102
|
+
try {
|
103
|
+
var pageTracker = _gat._getTracker("UA-2733799-11");
|
104
|
+
pageTracker._trackPageview();
|
105
|
+
} catch(err) {}</script>
|
106
|
+
|
107
|
+
</body>
|
108
|
+
</html>
|
@@ -0,0 +1,24 @@
|
|
1
|
+
class BlankHtml
|
2
|
+
include DataMapper::YS
|
3
|
+
uri spec_data_path("blank.html")
|
4
|
+
end
|
5
|
+
|
6
|
+
class BlankStyle
|
7
|
+
include DataMapper::YS
|
8
|
+
uri spec_data_path("plugins1.html")
|
9
|
+
end
|
10
|
+
|
11
|
+
class TableStyle < BlankStyle
|
12
|
+
uri spec_data_path("plugins1.html")
|
13
|
+
table "table.main"
|
14
|
+
end
|
15
|
+
|
16
|
+
class TheadStyle < BlankStyle
|
17
|
+
uri spec_data_path("plugins1.html")
|
18
|
+
thead "table.main"
|
19
|
+
end
|
20
|
+
|
21
|
+
class ThStyle
|
22
|
+
include DataMapper::YS
|
23
|
+
uri spec_data_path("th.html")
|
24
|
+
end
|
data/spec/proxy_spec.rb
CHANGED
@@ -1,26 +1,6 @@
|
|
1
1
|
require File.join( File.dirname(__FILE__), "spec_helper" )
|
2
2
|
|
3
3
|
describe DataMapper::YS do
|
4
|
-
class ::BlankHtml
|
5
|
-
include DataMapper::YS
|
6
|
-
uri spec_data_path("blank.html")
|
7
|
-
end
|
8
|
-
|
9
|
-
class ::BlankStyle
|
10
|
-
include DataMapper::YS
|
11
|
-
uri spec_data_path("plugins1.html")
|
12
|
-
end
|
13
|
-
|
14
|
-
class ::TableStyle < BlankStyle
|
15
|
-
uri spec_data_path("plugins1.html")
|
16
|
-
table "table.main"
|
17
|
-
end
|
18
|
-
|
19
|
-
class ::TheadStyle < BlankStyle
|
20
|
-
uri spec_data_path("plugins1.html")
|
21
|
-
thead "table.main"
|
22
|
-
end
|
23
|
-
|
24
4
|
######################################################################
|
25
5
|
### Config
|
26
6
|
|
@@ -120,6 +100,5 @@ describe DataMapper::YS do
|
|
120
100
|
["Name", "Repos", "Registered by", "Description", ""]
|
121
101
|
end
|
122
102
|
end
|
123
|
-
|
124
103
|
end
|
125
104
|
end
|
data/spec/scraper_spec.rb
CHANGED
data/spec/spec_helper.rb
CHANGED
@@ -32,6 +32,8 @@ rescue LoadError
|
|
32
32
|
end
|
33
33
|
|
34
34
|
mapping = {
|
35
|
+
"http://merbi.st/plugins/" => spec_data_path("plugins1.html"),
|
36
|
+
|
35
37
|
# plugin (paginated)
|
36
38
|
"http://merbi.st/plugins/index?page=1" => spec_data_path("plugins1.html"),
|
37
39
|
"http://merbi.st/plugins/index?page=2" => spec_data_path("plugins2.html"),
|
@@ -0,0 +1,17 @@
|
|
1
|
+
require File.join( File.dirname(__FILE__), "spec_helper" )
|
2
|
+
|
3
|
+
describe DataMapper::YS::Scraper::Composite do
|
4
|
+
describe "UniqPlugin" do
|
5
|
+
it "should return 2 pages" do
|
6
|
+
UniqPlugin.proxy.pages.size.should == 2
|
7
|
+
end
|
8
|
+
|
9
|
+
describe "#count" do
|
10
|
+
it "should return same value as Plugin" do
|
11
|
+
UniqPlugin1.count.should == 2
|
12
|
+
UniqPlugin2.count.should == 2
|
13
|
+
UniqPlugin .count.should == 3
|
14
|
+
end
|
15
|
+
end
|
16
|
+
end
|
17
|
+
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: maiha-dm-ys
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 0.4.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- maiha
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2009-03-
|
12
|
+
date: 2009-03-08 00:00:00 -08:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
@@ -59,6 +59,7 @@ files:
|
|
59
59
|
- lib/dm-ys
|
60
60
|
- lib/dm-ys/base.rb
|
61
61
|
- lib/dm-ys/config.rb
|
62
|
+
- lib/dm-ys/element_property.rb
|
62
63
|
- lib/dm-ys/indexed_property.rb
|
63
64
|
- lib/dm-ys/memory_repository.rb
|
64
65
|
- lib/dm-ys/scraper.rb
|
@@ -69,15 +70,19 @@ files:
|
|
69
70
|
- spec/data/uniq2.html
|
70
71
|
- spec/data/sorted2.html
|
71
72
|
- spec/data/sorted1.html
|
73
|
+
- spec/data/th.html
|
72
74
|
- spec/data/plugins1.html
|
73
75
|
- spec/data/gem_maintainers.html
|
74
76
|
- spec/data/plugins2.html
|
75
77
|
- spec/data/uniq1.html
|
76
78
|
- spec/data/blank.html
|
77
79
|
- spec/models
|
80
|
+
- spec/models/style.rb
|
78
81
|
- spec/models/gem_maintainer.rb
|
79
82
|
- spec/models/plugin.rb
|
80
83
|
- spec/composite_scraper_spec.rb
|
84
|
+
- spec/uniq_record_spec.rb
|
85
|
+
- spec/attribute_spec.rb
|
81
86
|
- spec/anonymous_spec.rb
|
82
87
|
- spec/scraper_spec.rb
|
83
88
|
- spec/indexed_property_spec.rb
|