maiha-dm-ys 0.1 → 0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README +14 -10
- data/Rakefile +1 -1
- data/lib/dm-ys/cached_accessor.rb +3 -1
- data/lib/dm-ys/proxy.rb +7 -2
- data/lib/dm-ys/scraper.rb +80 -4
- data/spec/guess_spec.rb +90 -0
- data/spec/spec_helper.rb +12 -0
- metadata +6 -1
data/README
CHANGED
@@ -6,18 +6,22 @@ a DataMapper extension that uses html table as its schema and data powerfully li
|
|
6
6
|
Example
|
7
7
|
=======
|
8
8
|
|
9
|
-
class
|
9
|
+
class Maintainer
|
10
10
|
include DataMapper::YunkerStar
|
11
|
-
|
12
|
-
uri "http://ds.gkwiki2.com/47.html"
|
13
|
-
thead "table.style_table thead tr"
|
14
|
-
tbody "table.style_table tbody tr"
|
11
|
+
uri "http://datamapper.org/doku.php?id=gem_maintainers"
|
15
12
|
end
|
16
|
-
|
17
|
-
irb(main):001:0>
|
18
|
-
=>
|
19
|
-
|
20
|
-
|
13
|
+
|
14
|
+
irb(main):001:0> Maintainer.count
|
15
|
+
=> 31
|
16
|
+
|
17
|
+
irb(main):002:0> Maintainer.first
|
18
|
+
=> #<Maintainer id=nil _Gem="data_objects" _Maintainer="dbussink" Name="Dirkjan Bussink">
|
19
|
+
|
20
|
+
irb(main):003:0> Maintainer.names
|
21
|
+
=> ["_Gem", "_Maintainer", "Name"]
|
22
|
+
|
23
|
+
irb(main):004:0> Maintainer.labels
|
24
|
+
=> ["Gem", "Maintainer", "Name"]
|
21
25
|
|
22
26
|
|
23
27
|
Copyright (c) 2008 maiha@wota.jp, released under the MIT license
|
data/Rakefile
CHANGED
@@ -33,7 +33,7 @@ AUTHOR = "maiha"
|
|
33
33
|
EMAIL = "maiha@wota.jp"
|
34
34
|
HOMEPAGE = "http://github.com/maiha/dm-ys"
|
35
35
|
SUMMARY = "a DataMapper extension that uses html table as its schema and data powerfully like YunkerStar"
|
36
|
-
GEM_VERSION = "0.1"
|
36
|
+
GEM_VERSION = "0.2"
|
37
37
|
|
38
38
|
spec = Gem::Specification.new do |s|
|
39
39
|
# s.rubyforge_project = 'merb'
|
data/lib/dm-ys/cached_accessor.rb
CHANGED
@@ -21,7 +21,9 @@ module DataMapper
|
|
21
21
|
end
|
22
22
|
|
23
23
|
def method_missing(symbol, &block)
|
24
|
-
|
24
|
+
cached = "__cached__#{symbol}"
|
25
|
+
@klass.send(:define_method, cached, &block)
|
26
|
+
@klass.class_eval("def #{symbol}; @#{cached} ||= #{cached}; end", "(__CACHED_ACCESSOR__)", 1)
|
25
27
|
end
|
26
28
|
end
|
27
29
|
end
|
data/lib/dm-ys/proxy.rb
CHANGED
@@ -15,6 +15,7 @@ module DataMapper
|
|
15
15
|
model.class_eval do
|
16
16
|
extend ClassMethods
|
17
17
|
dsl_accessor :uri
|
18
|
+
dsl_accessor :table
|
18
19
|
dsl_accessor :tbody
|
19
20
|
dsl_accessor :thead
|
20
21
|
property :id, DataMapper::Types::Serial
|
@@ -28,13 +29,17 @@ module DataMapper
|
|
28
29
|
|
29
30
|
def lazy_load
|
30
31
|
loader = Scraper.new(self)
|
31
|
-
loader.
|
32
|
+
loader.names.each do |name|
|
32
33
|
type = String # TODO
|
33
34
|
property name.intern, type
|
34
35
|
end
|
35
36
|
return loader
|
36
37
|
end
|
37
38
|
|
39
|
+
def names
|
40
|
+
proxy.names
|
41
|
+
end
|
42
|
+
|
38
43
|
def labels
|
39
44
|
proxy.labels
|
40
45
|
end
|
@@ -49,7 +54,7 @@ module DataMapper
|
|
49
54
|
|
50
55
|
def all
|
51
56
|
@all ||= proxy.entries.map{|array|
|
52
|
-
new(Hash[*proxy.
|
57
|
+
new(Hash[*proxy.names.zip(array).flatten])
|
53
58
|
}
|
54
59
|
end
|
55
60
|
|
data/lib/dm-ys/scraper.rb
CHANGED
@@ -11,17 +11,93 @@ module DataMapper
|
|
11
11
|
class Scraper
|
12
12
|
include CachedAccessor
|
13
13
|
|
14
|
+
class TableNotFound < RuntimeError; end
|
15
|
+
|
16
|
+
attr_reader :html
|
17
|
+
|
14
18
|
def initialize(model)
|
19
|
+
raise ArgumentError, "missing model" unless model
|
20
|
+
raise ArgumentError, "missing uri" unless model.uri
|
15
21
|
@model = model
|
16
|
-
@html = NKF.nkf('-w', open(
|
22
|
+
@html = NKF.nkf('-w', open(model.uri).read)
|
23
|
+
@invalid_name_count = 0
|
24
|
+
end
|
25
|
+
|
26
|
+
def guess_table
|
27
|
+
[max_table_from("table"), max_table_from("table > tbody")].sort_by(&:first).last.last or
|
28
|
+
raise TableNotFound, "set 'table' or 'tbody' manually"
|
17
29
|
end
|
18
30
|
|
19
31
|
cached_accessor do
|
20
32
|
doc {Hpricot(@html)}
|
21
|
-
|
22
|
-
|
33
|
+
table {specified(:table) or guess_table}
|
34
|
+
thead {specified(:thead) or table.search("> thead").first or table}
|
35
|
+
tbody {specified(:tbody) or table.search("> tbody").first or table}
|
36
|
+
names {labels.map{|i| label2name(i)}}
|
37
|
+
labels {thead.search("> tr").first.search("> td|th").map{|i|strip_tags(i.inner_html)}}
|
38
|
+
entries {tbody.search("> tr").map{|tr| tr.search("> td").map{|i|strip_tags(i.inner_html)}}.delete_if{|i|i.blank?}}
|
23
39
|
end
|
24
|
-
end
|
25
40
|
|
41
|
+
private
|
42
|
+
|
43
|
+
def max_table_from(entry)
|
44
|
+
table = nil
|
45
|
+
count = -1
|
46
|
+
doc.search(entry).each do |t|
|
47
|
+
size = t.search("> tr").size
|
48
|
+
if size > count
|
49
|
+
count = size
|
50
|
+
table = t
|
51
|
+
end
|
52
|
+
end
|
53
|
+
[count, table]
|
54
|
+
end
|
55
|
+
|
56
|
+
def specified(name)
|
57
|
+
@model.respond_to?(name) or raise ArgumentError, "invalid selector name: #{name}"
|
58
|
+
css = @model.__send__(name) or return nil
|
59
|
+
|
60
|
+
element = doc.search(css)
|
61
|
+
case element
|
62
|
+
when Hpricot::Elem
|
63
|
+
return element
|
64
|
+
when Hpricot::Elements
|
65
|
+
return element.first
|
66
|
+
else
|
67
|
+
return nil
|
68
|
+
end
|
69
|
+
end
|
70
|
+
|
71
|
+
def label2name(label)
|
72
|
+
require 'cgi'
|
73
|
+
label = CGI.unescapeHTML(label)
|
74
|
+
label.gsub!(/ /, '')
|
75
|
+
label.gsub!(/\r?\n/, '')
|
76
|
+
label.delete!('!"#$%&()=~|`{}^-[]/<>:; \\')
|
77
|
+
label.delete!("'")
|
78
|
+
label.strip!
|
79
|
+
|
80
|
+
if /^([A-Z])/ === label and Object.const_defined?(label)
|
81
|
+
label = "_#{label}"
|
82
|
+
end
|
83
|
+
if label.blank? or @model.respond_to?(label, true)
|
84
|
+
new_name_for(label)
|
85
|
+
elsif /^[0-9]/ === label
|
86
|
+
"_#{label}"
|
87
|
+
else
|
88
|
+
label
|
89
|
+
end
|
90
|
+
end
|
91
|
+
|
92
|
+
def new_name_for(label)
|
93
|
+
@invalid_name_count += 1
|
94
|
+
"col_#{@invalid_name_count}"
|
95
|
+
end
|
96
|
+
|
97
|
+
def strip_tags(html)
|
98
|
+
html.gsub(/<.*?>/, '').strip
|
99
|
+
end
|
100
|
+
|
101
|
+
end
|
26
102
|
end
|
27
103
|
end
|
data/spec/guess_spec.rb
ADDED
@@ -0,0 +1,90 @@
|
|
1
|
+
require File.join( File.dirname(__FILE__), "spec_helper" )
|
2
|
+
|
3
|
+
describe DataMapper::YunkerStar do
|
4
|
+
class BlankHtml
|
5
|
+
include DataMapper::YunkerStar
|
6
|
+
uri spec_data_path("blank.html")
|
7
|
+
end
|
8
|
+
|
9
|
+
class BlankStyle
|
10
|
+
include DataMapper::YunkerStar
|
11
|
+
uri spec_data_path("ki.html")
|
12
|
+
end
|
13
|
+
|
14
|
+
class TableStyle < BlankStyle
|
15
|
+
uri spec_data_path("ki.html")
|
16
|
+
table "table.main"
|
17
|
+
end
|
18
|
+
|
19
|
+
class TheadStyle < BlankStyle
|
20
|
+
uri spec_data_path("ki.html")
|
21
|
+
thead "table.main"
|
22
|
+
end
|
23
|
+
|
24
|
+
it "should provide proxy" do
|
25
|
+
BlankStyle.should respond_to(:proxy)
|
26
|
+
end
|
27
|
+
|
28
|
+
describe ".proxy" do
|
29
|
+
it "should provide guess_table" do
|
30
|
+
BlankStyle.proxy.should respond_to(:guess_table)
|
31
|
+
end
|
32
|
+
|
33
|
+
describe "#guess_table" do
|
34
|
+
it "should return a Hpricot::Elem" do
|
35
|
+
BlankStyle.proxy.guess_table.class.should == Hpricot::Elem
|
36
|
+
end
|
37
|
+
|
38
|
+
it "should return a right element" do
|
39
|
+
BlankStyle.proxy.guess_table["class"].should == "main"
|
40
|
+
end
|
41
|
+
|
42
|
+
it "should raise when the html contains no tables" do
|
43
|
+
lambda {
|
44
|
+
BlankHtml.proxy.guess_table
|
45
|
+
}.should raise_error(DataMapper::YunkerStar::Scraper::TableNotFound)
|
46
|
+
end
|
47
|
+
end
|
48
|
+
|
49
|
+
it "should provide table" do
|
50
|
+
BlankStyle.proxy.should respond_to(:table)
|
51
|
+
|
52
|
+
describe "#table" do
|
53
|
+
it "should raise when the html contains no tables" do
|
54
|
+
lambda {
|
55
|
+
BlankHtml.proxy.table
|
56
|
+
}.should raise_error(DataMapper::YunkerStar::Scraper::TableNotFound)
|
57
|
+
end
|
58
|
+
|
59
|
+
it "should return specified table" do
|
60
|
+
table = TableStyle.proxy.table
|
61
|
+
table.class.should == Hpricot::Elem
|
62
|
+
table[:class].should == "main"
|
63
|
+
end
|
64
|
+
end
|
65
|
+
end
|
66
|
+
|
67
|
+
it "should provide thead" do
|
68
|
+
BlankStyle.proxy.should respond_to(:thead)
|
69
|
+
|
70
|
+
describe "#thead" do
|
71
|
+
it "should raise when the html contains no tables" do
|
72
|
+
lambda {
|
73
|
+
BlankHtml.proxy.thead
|
74
|
+
}.should raise_error(DataMapper::YunkerStar::Scraper::TableNotFound)
|
75
|
+
end
|
76
|
+
end
|
77
|
+
end
|
78
|
+
|
79
|
+
it "should provide labels" do
|
80
|
+
BlankStyle.proxy.should respond_to(:labels)
|
81
|
+
|
82
|
+
describe "#labels" do
|
83
|
+
it "should return th values" do
|
84
|
+
BlankStyle.proxy.labels.map(&:strip).should == %w( col1 col2 col3 col4 )
|
85
|
+
end
|
86
|
+
end
|
87
|
+
end
|
88
|
+
|
89
|
+
end
|
90
|
+
end
|
data/spec/spec_helper.rb
ADDED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: maiha-dm-ys
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: "0.1"
|
4
|
+
version: "0.2"
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- maiha
|
@@ -59,6 +59,11 @@ files:
|
|
59
59
|
- lib/dm-ys/scraper.rb
|
60
60
|
- lib/dm-ys/cached_accessor.rb
|
61
61
|
- lib/dm-ys/proxy.rb
|
62
|
+
- spec/guess_spec.rb
|
63
|
+
- spec/data
|
64
|
+
- spec/data/ki.html
|
65
|
+
- spec/data/blank.html
|
66
|
+
- spec/spec_helper.rb
|
62
67
|
has_rdoc: true
|
63
68
|
homepage: http://github.com/maiha/dm-ys
|
64
69
|
post_install_message:
|