doc_wrapper 0.9.2 → 0.9.4
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile.lock +1 -1
- data/README.txt +63 -0
- data/lib/doc_wrapper.rb +2 -0
- data/lib/doc_wrapper/base_class_methods.rb +16 -29
- data/lib/doc_wrapper/date_property_definition.rb +3 -6
- data/lib/doc_wrapper/has_many_property_definition.rb +33 -0
- data/lib/doc_wrapper/has_one_property_definition.rb +17 -0
- data/lib/doc_wrapper/inner_html_property_definition.rb +15 -3
- data/lib/doc_wrapper/properties.rb +1 -25
- data/lib/doc_wrapper/time_property_definition.rb +2 -0
- data/lib/doc_wrapper/version.rb +1 -1
- data/spec/atom_examlple_spec.rb +2 -6
- data/spec/doc_wrapper_spec.rb +2 -2
- metadata +68 -25
data/Gemfile.lock
CHANGED
data/README.txt
ADDED
@@ -0,0 +1,63 @@
|
|
1
|
+
DocWrapper is a simple DSL for creating wrappers around DOM objects.
|
2
|
+
|
3
|
+
Usage
|
4
|
+
|
5
|
+
% gem install doc_wrapper
|
6
|
+
|
7
|
+
Example Usages
|
8
|
+
|
9
|
+
DocWrapper allows you to easily create a declarative wrapper that extracts data from HTML or XML Document Object Model (DOM) documents and optionally transforms the extracted values.
|
10
|
+
|
11
|
+
DocWrapper will work with any underlying "document" that has a search method, such as a DOM generated by Nokogiri or Hpricot. This means the selectors used by DocWrapper can be any selector your DOM library supports. Using Nokogiri, you can use either XPath or CSS selectors for very flexible property definitions.
|
12
|
+
|
13
|
+
DocWrapper works by declaring properties with a name, type, and the search path to find the raw data in the DOM.
|
14
|
+
|
15
|
+
Basic Example
|
16
|
+
|
17
|
+
require 'nokogiri'
|
18
|
+
require 'doc_wrapper'
|
19
|
+
|
20
|
+
html = %{
|
21
|
+
<html>
|
22
|
+
<body>
|
23
|
+
<p class="first_name">Mark</p>
|
24
|
+
<p class="last_name">Menard</p>
|
25
|
+
</body>
|
26
|
+
</html>
|
27
|
+
}
|
28
|
+
|
29
|
+
class PersonWrapper
|
30
|
+
include DocWrapper::Base
|
31
|
+
include DocWrapper::Properties
|
32
|
+
|
33
|
+
property :first_name, :string, '//p[@class="first_name"]'
|
34
|
+
property :last_name, :string, '//p[@class="last_name"]'
|
35
|
+
end
|
36
|
+
|
37
|
+
person_wrapper = PersonWrapper.new(Nokogiri::HTML(html))
|
38
|
+
person_wrapper.first_name # => 'Mark'
|
39
|
+
person_wrapper.last_name # => 'Menard'
|
40
|
+
|
41
|
+
Supported Property Types
|
42
|
+
|
43
|
+
Currently DocWrapper supports the :string, :date, :time, :boolean, and :raw property types. Additionally, DocWrapper supports embedded wrappers through has_one and has_many, which work much like the ActiveRecord associations of the same names. See the specs for example usages.
|
44
|
+
|
45
|
+
Access to Node Attributes
|
46
|
+
|
47
|
+
String, Date, Time and Boolean properties can reference an attribute on a node.
|
48
|
+
|
49
|
+
Given the following XML document:
|
50
|
+
|
51
|
+
<?xml version="1.0" encoding="UTF-8"?>
|
52
|
+
<feed>
|
53
|
+
<link type="text/html" href="http://search.twitter.com/search?q=yahoo.com" rel="alternate"/>
|
54
|
+
</feed>
|
55
|
+
|
56
|
+
You can access the link href with the following property definition.
|
57
|
+
|
58
|
+
class FeedWrapper
|
59
|
+
include DocWrapper::Base
|
60
|
+
include DocWrapper::Properties
|
61
|
+
|
62
|
+
property :link, :string, '//feed/link', :use_attribute => :href
|
63
|
+
end
|
data/lib/doc_wrapper.rb
CHANGED
@@ -13,6 +13,8 @@ require 'doc_wrapper/base'
|
|
13
13
|
require 'doc_wrapper/multi_property_definition'
|
14
14
|
require 'doc_wrapper/base_property_definition'
|
15
15
|
require 'doc_wrapper/raw_property_definition'
|
16
|
+
require 'doc_wrapper/has_many_property_definition'
|
17
|
+
require 'doc_wrapper/has_one_property_definition'
|
16
18
|
require 'doc_wrapper/inner_html_property_definition'
|
17
19
|
require 'doc_wrapper/string_property_definition'
|
18
20
|
require 'doc_wrapper/date_property_definition'
|
@@ -9,29 +9,26 @@ module DocWrapper
|
|
9
9
|
end
|
10
10
|
|
11
11
|
def multi_property (property_name, selectors, options = {}, &block)
|
12
|
-
raise "Multi
|
12
|
+
raise "Multi-properties require a block" if block.nil?
|
13
13
|
add_property_definition(property_name, MultiPropertyDefinition.new(property_name, selectors, initialize_options(options), block))
|
14
14
|
end
|
15
15
|
|
16
16
|
def has_many (property_name, selector, klass, options = {})
|
17
|
-
|
17
|
+
wrapper = HasManyPropertyDefinition.new(property_name, selector, klass, initialize_options(options))
|
18
18
|
define_method property_name do
|
19
|
-
|
19
|
+
wrapper.property(documents)
|
20
20
|
end
|
21
21
|
end
|
22
22
|
|
23
23
|
def has_one (property_name, selector, klass, options = {})
|
24
|
-
|
24
|
+
wrapper = HasOnePropertyDefinition.new(property_name, selector, klass, initialize_options(options))
|
25
25
|
define_method(property_name) do
|
26
|
-
|
26
|
+
wrapper.property(documents)
|
27
27
|
end
|
28
28
|
end
|
29
29
|
|
30
30
|
def namespaces (namespaces)
|
31
31
|
@namespaces = namespaces
|
32
|
-
# define_method(:namespaces) do
|
33
|
-
# namespaces
|
34
|
-
# end
|
35
32
|
end
|
36
33
|
|
37
34
|
##################
|
@@ -40,27 +37,26 @@ module DocWrapper
|
|
40
37
|
|
41
38
|
def add_property_definition (property_name, wrapper)
|
42
39
|
add_property_name(property_name)
|
43
|
-
|
44
|
-
add_property_accessor(property_name)
|
40
|
+
add_property_accessor(property_name, wrapper)
|
45
41
|
end
|
46
42
|
|
47
43
|
# Add a property name to the singleton property_names attribute.
|
48
44
|
def add_property_name (property_name)
|
49
|
-
# Add the property name to the property_names collection.
|
50
45
|
self.property_names << property_name
|
51
46
|
end
|
52
47
|
|
53
|
-
|
54
|
-
|
55
|
-
|
48
|
+
def add_property_accessor (property_name, wrapper)
|
49
|
+
define_method(property_name) do
|
50
|
+
wrapper.property(documents)
|
51
|
+
end
|
56
52
|
end
|
57
53
|
|
58
|
-
def
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
54
|
+
def build_property_definition (property_name, type, selector, options, block)
|
55
|
+
DocWrapper.const_get("#{camelize(type.to_s)}PropertyDefinition").new(property_name, type, selector, initialize_options(options), block)
|
56
|
+
end
|
57
|
+
|
58
|
+
def camelize (string)
|
59
|
+
string.gsub(/\/(.?)/) { "::#{$1.upcase}" }.gsub(/(?:^|_)(.)/) { $1.upcase }
|
64
60
|
end
|
65
61
|
|
66
62
|
# Set default options that all properties need.
|
@@ -72,15 +68,6 @@ module DocWrapper
|
|
72
68
|
options[:namespaces] = @namespaces if @namespaces
|
73
69
|
options
|
74
70
|
end
|
75
|
-
|
76
|
-
|
77
|
-
def build_property_definition (property_name, type, selector, options, block)
|
78
|
-
DocWrapper.const_get("#{camelize(type.to_s)}PropertyDefinition").new(property_name, type, selector, initialize_options(options), block)
|
79
|
-
end
|
80
|
-
|
81
|
-
def camelize (string)
|
82
|
-
string.gsub(/\/(.?)/) { "::#{$1.upcase}" }.gsub(/(?:^|_)(.)/) { $1.upcase }
|
83
|
-
end
|
84
71
|
|
85
72
|
end
|
86
73
|
end
|
@@ -1,12 +1,9 @@
|
|
1
|
+
require 'date'
|
2
|
+
|
1
3
|
module DocWrapper
|
2
4
|
class DatePropertyDefinition < InnerHtmlPropertyDefinition
|
3
5
|
def transform (result)
|
4
|
-
|
5
|
-
result = block.call(result)
|
6
|
-
else
|
7
|
-
result = result.blank? ? nil : (options[:parser] ? options[:parser].call(result) : Date.parse(result))
|
8
|
-
end
|
9
|
-
result
|
6
|
+
block ? block.call(result) : (result.blank? ? nil : (options[:parser] ? options[:parser].call(result) : Date.parse(result)))
|
10
7
|
end
|
11
8
|
end
|
12
9
|
end
|
@@ -0,0 +1,33 @@
|
|
1
|
+
class HasManyPropertyDefinition
|
2
|
+
|
3
|
+
attr_accessor :property_name, :selector, :klass, :options, :nodes
|
4
|
+
|
5
|
+
def initialize (property_name, selector, klass, options)
|
6
|
+
@property_name = property_name
|
7
|
+
@selector = selector
|
8
|
+
@klass = klass
|
9
|
+
@options = options
|
10
|
+
end
|
11
|
+
|
12
|
+
def property (documents)
|
13
|
+
self.nodes = documents.collect do |doc|
|
14
|
+
if options[:namespaces]
|
15
|
+
result = doc.search(selector, options[:namespaces])
|
16
|
+
else
|
17
|
+
result = doc.search(selector)
|
18
|
+
end
|
19
|
+
result.blank? ? nil : result
|
20
|
+
end.flatten.compact
|
21
|
+
|
22
|
+
nodes[start_row..end_row].collect { |node| klass.new(node) }
|
23
|
+
end
|
24
|
+
|
25
|
+
def start_row
|
26
|
+
options[:start_row] ? options[:start_row] : 0
|
27
|
+
end
|
28
|
+
|
29
|
+
def end_row
|
30
|
+
options[:end_row] ? options[:end_row] : nodes.size - 1
|
31
|
+
end
|
32
|
+
|
33
|
+
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
class HasOnePropertyDefinition
|
2
|
+
|
3
|
+
attr_accessor :property_name, :selector, :klass, :options
|
4
|
+
|
5
|
+
def initialize (property_name, selector, klass, options)
|
6
|
+
@property_name = property_name
|
7
|
+
@selector = selector
|
8
|
+
@klass = klass
|
9
|
+
@options = options
|
10
|
+
end
|
11
|
+
|
12
|
+
def property (documents)
|
13
|
+
nodes = documents.collect { |doc| result = doc.search(selector) ; result.blank? ? nil : result }.flatten.compact
|
14
|
+
klass.new(nodes)
|
15
|
+
end
|
16
|
+
|
17
|
+
end
|
@@ -1,13 +1,25 @@
|
|
1
1
|
module DocWrapper
|
2
2
|
class InnerHtmlPropertyDefinition < BasePropertyDefinition
|
3
3
|
def property (documents)
|
4
|
+
transform(raw_property(documents))
|
5
|
+
end
|
6
|
+
|
7
|
+
def raw_property (documents)
|
8
|
+
if options[:use_attribute]
|
9
|
+
get_nodes(documents).first[options[:use_attribute]]
|
10
|
+
else
|
11
|
+
get_nodes(documents).inner_html.strip
|
12
|
+
end
|
13
|
+
end
|
14
|
+
|
15
|
+
def get_nodes (documents)
|
4
16
|
if options[:namespaces]
|
5
|
-
|
17
|
+
documents[@options[:document] - 1].xpath(@selector, options[:namespaces])
|
6
18
|
else
|
7
19
|
begin
|
8
|
-
|
20
|
+
documents[@options[:document] - 1].search(@selector)
|
9
21
|
rescue Nokogiri::CSS::SyntaxError
|
10
|
-
|
22
|
+
documents[@options[:document] - 1].xpath(@selector)
|
11
23
|
end
|
12
24
|
end
|
13
25
|
end
|
@@ -15,34 +15,10 @@ module DocWrapper
|
|
15
15
|
def properties
|
16
16
|
result = Hash.new
|
17
17
|
property_names.each do |property|
|
18
|
-
|
19
|
-
result[property] = send(property)
|
20
|
-
rescue StandardError => e
|
21
|
-
raise e
|
22
|
-
end
|
18
|
+
result[property] = send(property)
|
23
19
|
end
|
24
20
|
result
|
25
21
|
end
|
26
22
|
|
27
|
-
def get_has_many (property_name, selector, klass, options)
|
28
|
-
nodes = @documents.collect do |doc|
|
29
|
-
if options[:namespaces]
|
30
|
-
result = doc.search(selector, options[:namespaces])
|
31
|
-
else
|
32
|
-
result = doc.search(selector)
|
33
|
-
end
|
34
|
-
result.blank? ? nil : result
|
35
|
-
end.flatten.compact
|
36
|
-
|
37
|
-
|
38
|
-
start_row = options[:start_row] ? options[:start_row] : 0
|
39
|
-
end_row = options[:end_row] ? options[:end_row] : nodes.size - 1
|
40
|
-
nodes[start_row..end_row].collect { |node| klass.new(node) }
|
41
|
-
end
|
42
|
-
|
43
|
-
def get_has_one (property_name, selector, klass, options)
|
44
|
-
nodes = @documents.collect { |doc| result = doc.search(selector) ; result.blank? ? nil : result }.flatten.compact
|
45
|
-
klass.new(nodes)
|
46
|
-
end
|
47
23
|
end
|
48
24
|
end
|
data/lib/doc_wrapper/version.rb
CHANGED
data/spec/atom_examlple_spec.rb
CHANGED
@@ -14,13 +14,9 @@ class AtomDocWrapper
|
|
14
14
|
|
15
15
|
property :twitter_id, :string, './atom:id'
|
16
16
|
property :published, :string, './atom:published'
|
17
|
-
property :link, :
|
18
|
-
node_list.first[:href]
|
19
|
-
end
|
17
|
+
property :link, :string, './atom:link[@type="text/html"]', :use_attribute => :href
|
20
18
|
property :updated, :string, './atom:updated'
|
21
|
-
property :author_avatar_link, :
|
22
|
-
node_list.first[:href]
|
23
|
-
end
|
19
|
+
property :author_avatar_link, :string, './atom:link[@type="image/png"]', :use_attribute => :href
|
24
20
|
property :author, :string, './atom:author/atom:name'
|
25
21
|
property :author_twitter_url, :string, './atom:author/atom:uri'
|
26
22
|
property :content_text, :string, './atom:title'
|
data/spec/doc_wrapper_spec.rb
CHANGED
@@ -161,8 +161,8 @@ class TestDocWrapper
|
|
161
161
|
multi_property :arrayed_xpath, ["/html/body/table[2]/tr[2]/td"] do |elements|
|
162
162
|
elements.join(" ")
|
163
163
|
end
|
164
|
-
property :raw_property, :raw, "/html/body/p[8]" do |
|
165
|
-
|
164
|
+
property :raw_property, :raw, "/html/body/p[8]" do |node_list|
|
165
|
+
node_list[0].attribute("class").inner_html
|
166
166
|
end
|
167
167
|
|
168
168
|
has_one :person, "/html/body/div", PersonWrapper
|
metadata
CHANGED
@@ -1,7 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: doc_wrapper
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
|
4
|
+
hash: 51
|
5
|
+
prerelease:
|
6
|
+
segments:
|
7
|
+
- 0
|
8
|
+
- 9
|
9
|
+
- 4
|
10
|
+
version: 0.9.4
|
5
11
|
platform: ruby
|
6
12
|
authors:
|
7
13
|
- Mark Menard
|
@@ -9,59 +15,85 @@ autorequire:
|
|
9
15
|
bindir: bin
|
10
16
|
cert_chain: []
|
11
17
|
|
12
|
-
date: 2011-
|
18
|
+
date: 2011-06-26 00:00:00 -04:00
|
13
19
|
default_executable:
|
14
20
|
dependencies:
|
15
21
|
- !ruby/object:Gem::Dependency
|
16
22
|
name: activesupport
|
17
|
-
|
18
|
-
|
19
|
-
|
23
|
+
prerelease: false
|
24
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
20
26
|
requirements:
|
21
27
|
- - ">="
|
22
28
|
- !ruby/object:Gem::Version
|
29
|
+
hash: 7
|
30
|
+
segments:
|
31
|
+
- 3
|
32
|
+
- 0
|
33
|
+
- 0
|
23
34
|
version: 3.0.0
|
24
|
-
|
35
|
+
type: :runtime
|
36
|
+
version_requirements: *id001
|
25
37
|
- !ruby/object:Gem::Dependency
|
26
38
|
name: bundler
|
27
|
-
|
28
|
-
|
29
|
-
|
39
|
+
prerelease: false
|
40
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
30
42
|
requirements:
|
31
43
|
- - ">="
|
32
44
|
- !ruby/object:Gem::Version
|
45
|
+
hash: 23
|
46
|
+
segments:
|
47
|
+
- 1
|
48
|
+
- 0
|
49
|
+
- 0
|
33
50
|
version: 1.0.0
|
34
|
-
|
51
|
+
type: :development
|
52
|
+
version_requirements: *id002
|
35
53
|
- !ruby/object:Gem::Dependency
|
36
54
|
name: nokogiri
|
37
|
-
|
38
|
-
|
39
|
-
|
55
|
+
prerelease: false
|
56
|
+
requirement: &id003 !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
40
58
|
requirements:
|
41
59
|
- - ">="
|
42
60
|
- !ruby/object:Gem::Version
|
61
|
+
hash: 3
|
62
|
+
segments:
|
63
|
+
- 0
|
43
64
|
version: "0"
|
44
|
-
|
65
|
+
type: :development
|
66
|
+
version_requirements: *id003
|
45
67
|
- !ruby/object:Gem::Dependency
|
46
68
|
name: rspec
|
47
|
-
|
48
|
-
|
49
|
-
|
69
|
+
prerelease: false
|
70
|
+
requirement: &id004 !ruby/object:Gem::Requirement
|
71
|
+
none: false
|
50
72
|
requirements:
|
51
73
|
- - ">="
|
52
74
|
- !ruby/object:Gem::Version
|
75
|
+
hash: 15
|
76
|
+
segments:
|
77
|
+
- 2
|
78
|
+
- 0
|
79
|
+
- 0
|
53
80
|
version: 2.0.0
|
54
|
-
|
81
|
+
type: :development
|
82
|
+
version_requirements: *id004
|
55
83
|
- !ruby/object:Gem::Dependency
|
56
84
|
name: ZenTest
|
57
|
-
|
58
|
-
|
59
|
-
|
85
|
+
prerelease: false
|
86
|
+
requirement: &id005 !ruby/object:Gem::Requirement
|
87
|
+
none: false
|
60
88
|
requirements:
|
61
89
|
- - ">="
|
62
90
|
- !ruby/object:Gem::Version
|
91
|
+
hash: 3
|
92
|
+
segments:
|
93
|
+
- 0
|
63
94
|
version: "0"
|
64
|
-
|
95
|
+
type: :development
|
96
|
+
version_requirements: *id005
|
65
97
|
description: Using the DocWrapper DSL you can easily define classes that wrap HTML DOM Documents allowing extraction of properties using either XPath or CSS selectors.
|
66
98
|
email:
|
67
99
|
- mark@enablelabs.com
|
@@ -75,6 +107,7 @@ files:
|
|
75
107
|
- .gitignore
|
76
108
|
- Gemfile
|
77
109
|
- Gemfile.lock
|
110
|
+
- README.txt
|
78
111
|
- Rakefile
|
79
112
|
- doc_wrapper.gemspec
|
80
113
|
- lib/doc_wrapper.rb
|
@@ -83,6 +116,8 @@ files:
|
|
83
116
|
- lib/doc_wrapper/base_property_definition.rb
|
84
117
|
- lib/doc_wrapper/boolean_property_definition.rb
|
85
118
|
- lib/doc_wrapper/date_property_definition.rb
|
119
|
+
- lib/doc_wrapper/has_many_property_definition.rb
|
120
|
+
- lib/doc_wrapper/has_one_property_definition.rb
|
86
121
|
- lib/doc_wrapper/inner_html_property_definition.rb
|
87
122
|
- lib/doc_wrapper/multi_property_definition.rb
|
88
123
|
- lib/doc_wrapper/properties.rb
|
@@ -107,21 +142,29 @@ rdoc_options: []
|
|
107
142
|
require_paths:
|
108
143
|
- lib
|
109
144
|
required_ruby_version: !ruby/object:Gem::Requirement
|
145
|
+
none: false
|
110
146
|
requirements:
|
111
147
|
- - ">="
|
112
148
|
- !ruby/object:Gem::Version
|
149
|
+
hash: 3
|
150
|
+
segments:
|
151
|
+
- 0
|
113
152
|
version: "0"
|
114
|
-
version:
|
115
153
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
154
|
+
none: false
|
116
155
|
requirements:
|
117
156
|
- - ">="
|
118
157
|
- !ruby/object:Gem::Version
|
158
|
+
hash: 23
|
159
|
+
segments:
|
160
|
+
- 1
|
161
|
+
- 3
|
162
|
+
- 6
|
119
163
|
version: 1.3.6
|
120
|
-
version:
|
121
164
|
requirements: []
|
122
165
|
|
123
166
|
rubyforge_project: doc_wrapper
|
124
|
-
rubygems_version: 1.
|
167
|
+
rubygems_version: 1.6.2
|
125
168
|
signing_key:
|
126
169
|
specification_version: 3
|
127
170
|
summary: Declarative DSL for defining classes to wrap HTML DOM Documents
|