maiha-dm-ys 0.1 → 0.2
Sign up to get free protection for your applications and to get access to all the features.
- data/README +14 -10
- data/Rakefile +1 -1
- data/lib/dm-ys/cached_accessor.rb +3 -1
- data/lib/dm-ys/proxy.rb +7 -2
- data/lib/dm-ys/scraper.rb +80 -4
- data/spec/guess_spec.rb +90 -0
- data/spec/spec_helper.rb +12 -0
- metadata +6 -1
data/README
CHANGED
@@ -6,18 +6,22 @@ a DataMapper extension that uses html table as its schema and data powerfully li
|
|
6
6
|
Example
|
7
7
|
=======
|
8
8
|
|
9
|
-
class
|
9
|
+
class Maintainer
|
10
10
|
include DataMapper::YunkerStar
|
11
|
-
|
12
|
-
uri "http://ds.gkwiki2.com/47.html"
|
13
|
-
thead "table.style_table thead tr"
|
14
|
-
tbody "table.style_table tbody tr"
|
11
|
+
uri "http://datamapper.org/doku.php?id=gem_maintainers"
|
15
12
|
end
|
16
|
-
|
17
|
-
irb(main):001:0>
|
18
|
-
=>
|
19
|
-
|
20
|
-
|
13
|
+
|
14
|
+
irb(main):001:0> Maintainer.count
|
15
|
+
=> 31
|
16
|
+
|
17
|
+
irb(main):002:0> Maintainer.first
|
18
|
+
=> #<Maintainer id=nil _Gem="data_objects" _Maintainer="dbussink" Name="Dirkjan Bussink">
|
19
|
+
|
20
|
+
irb(main):003:0> Maintainer.names
|
21
|
+
=> ["_Gem", "_Maintainer", "Name"]
|
22
|
+
|
23
|
+
irb(main):004:0> Maintainer.labels
|
24
|
+
=> ["Gem", "Maintainer", "Name"]
|
21
25
|
|
22
26
|
|
23
27
|
Copyright (c) 2008 maiha@wota.jp, released under the MIT license
|
data/Rakefile
CHANGED
@@ -33,7 +33,7 @@ AUTHOR = "maiha"
|
|
33
33
|
EMAIL = "maiha@wota.jp"
|
34
34
|
HOMEPAGE = "http://github.com/maiha/dm-ys"
|
35
35
|
SUMMARY = "a DataMapper extension that uses html table as its schema and data powerfully like YunkerStar"
|
36
|
-
GEM_VERSION = "0.1"
|
36
|
+
GEM_VERSION = "0.2"
|
37
37
|
|
38
38
|
spec = Gem::Specification.new do |s|
|
39
39
|
# s.rubyforge_project = 'merb'
|
@@ -21,7 +21,9 @@ module DataMapper
|
|
21
21
|
end
|
22
22
|
|
23
23
|
def method_missing(symbol, &block)
|
24
|
-
|
24
|
+
cached = "__cached__#{symbol}"
|
25
|
+
@klass.send(:define_method, cached, &block)
|
26
|
+
@klass.class_eval("def #{symbol}; @#{cached} ||= #{cached}; end", "(__CACHED_ACCESSOR__)", 1)
|
25
27
|
end
|
26
28
|
end
|
27
29
|
end
|
data/lib/dm-ys/proxy.rb
CHANGED
@@ -15,6 +15,7 @@ module DataMapper
|
|
15
15
|
model.class_eval do
|
16
16
|
extend ClassMethods
|
17
17
|
dsl_accessor :uri
|
18
|
+
dsl_accessor :table
|
18
19
|
dsl_accessor :tbody
|
19
20
|
dsl_accessor :thead
|
20
21
|
property :id, DataMapper::Types::Serial
|
@@ -28,13 +29,17 @@ module DataMapper
|
|
28
29
|
|
29
30
|
def lazy_load
|
30
31
|
loader = Scraper.new(self)
|
31
|
-
loader.
|
32
|
+
loader.names.each do |name|
|
32
33
|
type = String # TODO
|
33
34
|
property name.intern, type
|
34
35
|
end
|
35
36
|
return loader
|
36
37
|
end
|
37
38
|
|
39
|
+
def names
|
40
|
+
proxy.names
|
41
|
+
end
|
42
|
+
|
38
43
|
def labels
|
39
44
|
proxy.labels
|
40
45
|
end
|
@@ -49,7 +54,7 @@ module DataMapper
|
|
49
54
|
|
50
55
|
def all
|
51
56
|
@all ||= proxy.entries.map{|array|
|
52
|
-
new(Hash[*proxy.
|
57
|
+
new(Hash[*proxy.names.zip(array).flatten])
|
53
58
|
}
|
54
59
|
end
|
55
60
|
|
data/lib/dm-ys/scraper.rb
CHANGED
@@ -11,17 +11,93 @@ module DataMapper
|
|
11
11
|
class Scraper
|
12
12
|
include CachedAccessor
|
13
13
|
|
14
|
+
class TableNotFound < RuntimeError; end
|
15
|
+
|
16
|
+
attr_reader :html
|
17
|
+
|
14
18
|
def initialize(model)
|
19
|
+
raise ArgumentError, "missing model" unless model
|
20
|
+
raise ArgumentError, "missing uri" unless model.uri
|
15
21
|
@model = model
|
16
|
-
@html = NKF.nkf('-w', open(
|
22
|
+
@html = NKF.nkf('-w', open(model.uri).read)
|
23
|
+
@invalid_name_count = 0
|
24
|
+
end
|
25
|
+
|
26
|
+
def guess_table
|
27
|
+
[max_table_from("table"), max_table_from("table > tbody")].sort_by(&:first).last.last or
|
28
|
+
raise TableNotFound, "set 'table' or 'tbody' manually"
|
17
29
|
end
|
18
30
|
|
19
31
|
cached_accessor do
|
20
32
|
doc {Hpricot(@html)}
|
21
|
-
|
22
|
-
|
33
|
+
table {specified(:table) or guess_table}
|
34
|
+
thead {specified(:thead) or table.search("> thead").first or table}
|
35
|
+
tbody {specified(:tbody) or table.search("> tbody").first or table}
|
36
|
+
names {labels.map{|i| label2name(i)}}
|
37
|
+
labels {thead.search("> tr").first.search("> td|th").map{|i|strip_tags(i.inner_html)}}
|
38
|
+
entries {tbody.search("> tr").map{|tr| tr.search("> td").map{|i|strip_tags(i.inner_html)}}.delete_if{|i|i.blank?}}
|
23
39
|
end
|
24
|
-
end
|
25
40
|
|
41
|
+
private
|
42
|
+
|
43
|
+
def max_table_from(entry)
|
44
|
+
table = nil
|
45
|
+
count = -1
|
46
|
+
doc.search(entry).each do |t|
|
47
|
+
size = t.search("> tr").size
|
48
|
+
if size > count
|
49
|
+
count = size
|
50
|
+
table = t
|
51
|
+
end
|
52
|
+
end
|
53
|
+
[count, table]
|
54
|
+
end
|
55
|
+
|
56
|
+
def specified(name)
|
57
|
+
@model.respond_to?(name) or raise ArgumentError, "invalid selector name: #{name}"
|
58
|
+
css = @model.__send__(name) or return nil
|
59
|
+
|
60
|
+
element = doc.search(css)
|
61
|
+
case element
|
62
|
+
when Hpricot::Elem
|
63
|
+
return element
|
64
|
+
when Hpricot::Elements
|
65
|
+
return element.first
|
66
|
+
else
|
67
|
+
return nil
|
68
|
+
end
|
69
|
+
end
|
70
|
+
|
71
|
+
def label2name(label)
|
72
|
+
require 'cgi'
|
73
|
+
label = CGI.unescapeHTML(label)
|
74
|
+
label.gsub!(/ /, '')
|
75
|
+
label.gsub!(/\r?\n/, '')
|
76
|
+
label.delete!('!"#$%&()=~|`{}^-[]/<>:; \\')
|
77
|
+
label.delete!("'")
|
78
|
+
label.strip!
|
79
|
+
|
80
|
+
if /^([A-Z])/ === label and Object.const_defined?(label)
|
81
|
+
label = "_#{label}"
|
82
|
+
end
|
83
|
+
if label.blank? or @model.respond_to?(label, true)
|
84
|
+
new_name_for(label)
|
85
|
+
elsif /^[0-9]/ === label
|
86
|
+
"_#{label}"
|
87
|
+
else
|
88
|
+
label
|
89
|
+
end
|
90
|
+
end
|
91
|
+
|
92
|
+
def new_name_for(label)
|
93
|
+
@invalid_name_count += 1
|
94
|
+
"col_#{@invalid_name_count}"
|
95
|
+
end
|
96
|
+
|
97
|
+
def strip_tags(html)
|
98
|
+
html.gsub(/<.*?>/, '').strip
|
99
|
+
end
|
100
|
+
|
101
|
+
end
|
26
102
|
end
|
27
103
|
end
|
data/spec/guess_spec.rb
ADDED
@@ -0,0 +1,90 @@
|
|
1
|
+
require File.join( File.dirname(__FILE__), "spec_helper" )
|
2
|
+
|
3
|
+
describe DataMapper::YunkerStar do
|
4
|
+
class BlankHtml
|
5
|
+
include DataMapper::YunkerStar
|
6
|
+
uri spec_data_path("blank.html")
|
7
|
+
end
|
8
|
+
|
9
|
+
class BlankStyle
|
10
|
+
include DataMapper::YunkerStar
|
11
|
+
uri spec_data_path("ki.html")
|
12
|
+
end
|
13
|
+
|
14
|
+
class TableStyle < BlankStyle
|
15
|
+
uri spec_data_path("ki.html")
|
16
|
+
table "table.main"
|
17
|
+
end
|
18
|
+
|
19
|
+
class TheadStyle < BlankStyle
|
20
|
+
uri spec_data_path("ki.html")
|
21
|
+
thead "table.main"
|
22
|
+
end
|
23
|
+
|
24
|
+
it "should provide proxy" do
|
25
|
+
BlankStyle.should respond_to(:proxy)
|
26
|
+
end
|
27
|
+
|
28
|
+
describe ".proxy" do
|
29
|
+
it "should provide guess_table" do
|
30
|
+
BlankStyle.proxy.should respond_to(:guess_table)
|
31
|
+
end
|
32
|
+
|
33
|
+
describe "#guess_table" do
|
34
|
+
it "should return a Hpricot::Elem" do
|
35
|
+
BlankStyle.proxy.guess_table.class.should == Hpricot::Elem
|
36
|
+
end
|
37
|
+
|
38
|
+
it "should return a right element" do
|
39
|
+
BlankStyle.proxy.guess_table["class"].should == "main"
|
40
|
+
end
|
41
|
+
|
42
|
+
it "should raise when the html contains no tables" do
|
43
|
+
lambda {
|
44
|
+
BlankHtml.proxy.guess_table
|
45
|
+
}.should raise_error(DataMapper::YunkerStar::Scraper::TableNotFound)
|
46
|
+
end
|
47
|
+
end
|
48
|
+
|
49
|
+
it "should provide table" do
|
50
|
+
BlankStyle.proxy.should respond_to(:table)
|
51
|
+
|
52
|
+
describe "#table" do
|
53
|
+
it "should raise when the html contains no tables" do
|
54
|
+
lambda {
|
55
|
+
BlankHtml.proxy.table
|
56
|
+
}.should raise_error(DataMapper::YunkerStar::Scraper::TableNotFound)
|
57
|
+
end
|
58
|
+
|
59
|
+
it "should return specified table" do
|
60
|
+
table = TableStyle.proxy.table
|
61
|
+
table.class.should == Hpricot::Elem
|
62
|
+
table[:class].should == "main"
|
63
|
+
end
|
64
|
+
end
|
65
|
+
end
|
66
|
+
|
67
|
+
it "should provide thead" do
|
68
|
+
BlankStyle.proxy.should respond_to(:thead)
|
69
|
+
|
70
|
+
describe "#thead" do
|
71
|
+
it "should raise when the html contains no tables" do
|
72
|
+
lambda {
|
73
|
+
BlankHtml.proxy.thead
|
74
|
+
}.should raise_error(DataMapper::YunkerStar::Scraper::TableNotFound)
|
75
|
+
end
|
76
|
+
end
|
77
|
+
end
|
78
|
+
|
79
|
+
it "should provide labels" do
|
80
|
+
BlankStyle.proxy.should respond_to(:labels)
|
81
|
+
|
82
|
+
describe "#labels" do
|
83
|
+
it "should return th values" do
|
84
|
+
BlankStyle.proxy.labels.map(&:strip).should == %w( col1 col2 col3 col4 )
|
85
|
+
end
|
86
|
+
end
|
87
|
+
end
|
88
|
+
|
89
|
+
end
|
90
|
+
end
|
data/spec/spec_helper.rb
ADDED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: maiha-dm-ys
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: "0.1"
|
4
|
+
version: "0.2"
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- maiha
|
@@ -59,6 +59,11 @@ files:
|
|
59
59
|
- lib/dm-ys/scraper.rb
|
60
60
|
- lib/dm-ys/cached_accessor.rb
|
61
61
|
- lib/dm-ys/proxy.rb
|
62
|
+
- spec/guess_spec.rb
|
63
|
+
- spec/data
|
64
|
+
- spec/data/ki.html
|
65
|
+
- spec/data/blank.html
|
66
|
+
- spec/spec_helper.rb
|
62
67
|
has_rdoc: true
|
63
68
|
homepage: http://github.com/maiha/dm-ys
|
64
69
|
post_install_message:
|