doc_wrapper 0.0.1 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/Gemfile.lock ADDED
@@ -0,0 +1,32 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ doc_wrapper (0.0.1)
5
+ activesupport (>= 3.0.0)
6
+
7
+ GEM
8
+ remote: http://rubygems.org/
9
+ specs:
10
+ ZenTest (4.4.2)
11
+ activesupport (3.0.0)
12
+ diff-lcs (1.1.2)
13
+ nokogiri (1.4.4)
14
+ rspec (2.5.0)
15
+ rspec-core (~> 2.5.0)
16
+ rspec-expectations (~> 2.5.0)
17
+ rspec-mocks (~> 2.5.0)
18
+ rspec-core (2.5.1)
19
+ rspec-expectations (2.5.0)
20
+ diff-lcs (~> 1.1.2)
21
+ rspec-mocks (2.5.0)
22
+
23
+ PLATFORMS
24
+ ruby
25
+
26
+ DEPENDENCIES
27
+ ZenTest
28
+ activesupport (>= 3.0.0)
29
+ bundler (>= 1.0.0)
30
+ doc_wrapper!
31
+ nokogiri
32
+ rspec (>= 2.0.0)
@@ -0,0 +1,7 @@
1
module DocWrapper
  # Mixin that supplies the constructor shared by document wrappers.
  module Base
    # Collects every argument into @documents as a single flat list, so
    # documents may be passed one by one, as an array, or as a mix of both.
    def initialize(*documents)
      @documents = documents.flatten
    end
  end
end
@@ -0,0 +1,71 @@
1
module DocWrapper
  # Class-level DSL for declaring wrapper properties. The extending class is
  # expected to provide +property_names+ (an Array) and +property_definitions+
  # (a Hash) at class level; instances are expected to provide
  # +property_definitions+, +documents+, +get_has_many+ and +get_has_one+.
  module ClassMethods
    # Types accepted by #property. Each maps to a
    # DocWrapper::<Type>PropertyDefinition class.
    PROPERTY_TYPES = [:string, :date, :time, :boolean, :raw].freeze

    # Add a property name to the class-level property_names collection.
    def add_property_name(property_name)
      property_names << property_name
    end

    # Register +wrapper+ in the class-level property_definitions Hash under
    # +property_name+.
    def add_property_wrapper(property_name, wrapper)
      property_definitions[property_name] = wrapper
    end

    # Define an instance method named +property_name+ that evaluates the
    # registered definition against the instance's documents.
    #
    # define_method is used instead of evaluating a source string so that
    # names which are not bare identifiers do not produce a syntax error, and
    # so the definition is looked up under exactly the key it was stored with.
    def add_property_accessor(property_name)
      define_method(property_name) do
        property_definitions[property_name].property(documents)
      end
    end

    # Return +options+ with the defaults every property needs.
    #
    # :document defaults to 1. Property definitions index the documents array
    # with options[:document] - 1, so 1 means the first document.
    def initialize_options(options)
      { :document => 1 }.merge(options)
    end

    # Register a definition: record its name, store the wrapper and create
    # the accessor method.
    def add_property_definition(property_name, wrapper)
      add_property_name(property_name)
      add_property_wrapper(property_name, wrapper)
      add_property_accessor(property_name)
    end

    # Create a typed property definition for a document wrapper.
    # The property_name should be a symbol.
    #
    # type must be one of PROPERTY_TYPES; anything else raises a RuntimeError.
    # An optional block is handed to the definition together with the options.
    def property(property_name, type, selector, options = {}, &block)
      raise "Unhandled property type: #{type}" unless PROPERTY_TYPES.include?(type)

      # build_property_definition applies the default options itself.
      definition = build_property_definition(property_name, type, selector, options, block)
      add_property_definition(property_name, definition)
    end

    # Instantiate the DocWrapper::<Type>PropertyDefinition class for +type+.
    def build_property_definition(property_name, type, selector, options, block)
      definition_class = DocWrapper.const_get("#{camelize(type.to_s)}PropertyDefinition")
      definition_class.new(property_name, type, selector, initialize_options(options), block)
    end

    # Convert "inner_html" to "InnerHtml" and "a/b" to "A::B".
    def camelize(string)
      string.gsub(/\/(.?)/) { "::#{$1.upcase}" }.gsub(/(?:^|_)(.)/) { $1.upcase }
    end

    # Declare a property computed from several selectors. The block is
    # mandatory and is passed on to the MultiPropertyDefinition.
    def multi_property(property_name, selectors, options = {}, &block)
      raise "Multi properties require a block" if block.nil?

      definition = MultiPropertyDefinition.new(property_name, selectors, initialize_options(options), block)
      add_property_definition(property_name, definition)
    end

    # Declare a collection accessor that delegates to the instance's
    # get_has_many. It is not added to property_names.
    def has_many(property_name, selector, klass, options = {})
      options = initialize_options(options)
      define_method(property_name) do
        get_has_many(property_name, selector, klass, options)
      end
    end

    # Declare a single-object accessor that delegates to the instance's
    # get_has_one. It is not added to property_names.
    def has_one(property_name, selector, klass, options = {})
      options = initialize_options(options)
      define_method(property_name) do
        get_has_one(property_name, selector, klass, options)
      end
    end
  end
end
@@ -0,0 +1,25 @@
1
module DocWrapper
  # Common state and lookup logic for a single-selector property definition.
  class BasePropertyDefinition
    attr_accessor :property_name, :type, :selector, :options, :block

    # Stores the pieces of the declaration; nothing is evaluated here.
    def initialize(property_name, type, selector, options, block)
      @property_name, @type, @selector = property_name, type, selector
      @options, @block = options, block
    end

    # Looks the selector up in the document chosen by options[:document]
    # (1-based) and returns the transformed match. When the lookup raises a
    # CSS syntax error the selector is retried through #xpath.
    def property(documents)
      target = documents[@options[:document] - 1]
      begin
        transform(target.search(@selector))
      rescue Nokogiri::CSS::SyntaxError
        transform(target.xpath(@selector))
      end
    end

    # Hook for subclasses; the base implementation hands the match back
    # untouched.
    def transform(result)
      result
    end
  end
end
@@ -0,0 +1,13 @@
1
module DocWrapper
  # Property definition whose value is produced by a user-supplied block or
  # options[:parser]; there is no built-in string-to-boolean conversion.
  class BooleanPropertyDefinition < InnerHtmlPropertyDefinition
    # Converts the extracted text.
    #
    # * options[:boolean_test] is deprecated and always raises.
    # * With a block, the block receives the text (blank or not) and its
    #   return value is the result.
    # * Without a block, blank text yields nil and any other text is passed
    #   to options[:parser].
    #
    # Raises a RuntimeError when there is text to convert but neither a block
    # nor a parser was supplied. Previously this case fell through to calling
    # the (necessarily nil) boolean_test option and failed with a
    # NoMethodError on nil.
    def transform(result)
      if options[:boolean_test]
        raise "BooleanPropertyDefinition: boolean_test is deprecated please use options[:parser] or a block."
      end

      return block.call(result) if block
      return nil if result.blank?

      parser = options[:parser]
      unless parser
        raise "BooleanPropertyDefinition: a block or options[:parser] is required to convert #{result.inspect}."
      end
      parser.call(result)
    end
  end
end
@@ -0,0 +1,12 @@
1
module DocWrapper
  # Property definition that turns the extracted text into a date.
  class DatePropertyDefinition < InnerHtmlPropertyDefinition
    # A block, when given, takes over completely and receives the text as is.
    # Otherwise blank text gives nil, and non-blank text goes through
    # options[:parser] if present or Date.parse if not.
    def transform(result)
      return block.call(result) if block
      return nil if result.blank?

      parser = options[:parser]
      parser ? parser.call(result) : Date.parse(result)
    end
  end
end
@@ -0,0 +1,11 @@
1
module DocWrapper
  # Property definition that works on the stripped inner HTML of the match
  # rather than on the match itself.
  class InnerHtmlPropertyDefinition < BasePropertyDefinition
    # Resolves the selector in the document chosen by options[:document]
    # (1-based), takes the inner HTML of the match, strips surrounding
    # whitespace and passes the text to #transform. A CSS syntax error makes
    # the lookup retry through #xpath.
    def property(documents)
      target = documents[@options[:document] - 1]
      begin
        transform(target.search(@selector).inner_html.strip)
      rescue Nokogiri::CSS::SyntaxError
        transform(target.xpath(@selector).inner_html.strip)
      end
    end
  end
end
@@ -0,0 +1,32 @@
1
module DocWrapper
  # Property definition that gathers text from several selectors and lets a
  # block combine the pieces into one value.
  class MultiPropertyDefinition
    attr_accessor :property_name, :selectors, :options, :block

    def initialize(property_name, selectors, options, block)
      @property_name = property_name
      @selectors = selectors
      @options = options
      @block = block
    end

    # Collects the stripped inner HTML of every node matched by every
    # selector, in selector order, from the document chosen by
    # options[:document] (1-based), then hands the list to #transform.
    #
    # Each lookup result is enumerated node by node. The former alternative
    # branch called inner_html on an object that had just been shown not to
    # respond to it, so it could only raise and has been removed.
    def property(documents)
      document = documents[@options[:document] - 1]
      results = selectors.collect do |selector|
        find_nodes(document, selector).collect { |node| node.inner_html.strip }
      end
      transform(results.flatten)
    end

    # Returns nil when nothing matched; otherwise the block's result for the
    # collected strings.
    def transform(results)
      return nil if results.empty?

      block.call(results)
    end

    private

    # Looks +selector+ up with #search and retries with #xpath when the
    # selector is rejected as CSS, matching the fallback the single-selector
    # property definitions use.
    def find_nodes(document, selector)
      document.search(selector)
    rescue Nokogiri::CSS::SyntaxError
      document.xpath(selector)
    end
  end
end
@@ -0,0 +1,39 @@
1
module DocWrapper
  # Instance-side support for wrapper classes: exposes the wrapped documents,
  # reports all declared properties, and implements the lookups behind
  # has_many and has_one.
  module Properties

    attr_reader :documents

    # Gives the including class the declaration DSL and empty class-level
    # registries for property names and definitions.
    def self.included(base)
      base.extend(SattrAccessor)
      base.extend(ClassMethods)
      base.sattr_accessor :property_names
      base.sattr_accessor :property_definitions
      base.property_names = []
      base.property_definitions = {}
    end

    # Returns a Hash mapping every registered property name to the value of
    # its accessor. An error raised by an accessor propagates to the caller.
    def properties
      result = {}
      property_names.each { |name| result[name] = send(name) }
      result
    end

    # Wraps each node matching +selector+ (across all documents, in document
    # order) in +klass+.
    #
    # options[:start_row] and options[:end_row] are inclusive, zero-based
    # bounds on the matched nodes; they default to the first and last node.
    # A start_row beyond the matches yields an empty Array rather than an
    # error.
    def get_has_many(property_name, selector, klass, options)
      nodes = matching_nodes(selector)
      start_row = options[:start_row] || 0
      end_row = options[:end_row] || nodes.size - 1
      # Array#[] returns nil when the range starts past the end.
      (nodes[start_row..end_row] || []).collect { |node| klass.new(node) }
    end

    # Builds a single +klass+ instance from all nodes matching +selector+
    # across all documents.
    def get_has_one(property_name, selector, klass, options)
      klass.new(matching_nodes(selector))
    end

    private

    # Every node matching +selector+ in any wrapped document, as one flat
    # Array. Empty results add nothing when flattened, so they need no
    # special handling.
    def matching_nodes(selector)
      @documents.collect { |doc| doc.search(selector) }.flatten.compact
    end
  end
end
@@ -0,0 +1,8 @@
1
module DocWrapper
  # Property definition that exposes the lookup result itself, optionally
  # post-processed by the declaration's block.
  class RawPropertyDefinition < BasePropertyDefinition
    # Returns the block's result for the match when a block was given, and
    # the untouched match otherwise.
    def transform(result)
      block ? block.call(result) : result
    end
  end
end
@@ -0,0 +1,55 @@
1
module DocWrapper

  # Class-level attribute macros. Each attribute lives in an instance
  # variable of the class it is read from or written to, and is also
  # reachable through same-named instance methods that delegate to the
  # instance's class.
  module SattrAccessor

    # Defines a class-level reader and an instance-level reader for +sym+.
    #
    # Options:
    #   :inheritable - when true, the instance reader combines the value held
    #                  by the instance's class with the value held by its
    #                  direct superclass (Arrays: own entries followed by the
    #                  inherited ones, flattened; Hashes: inherited entries
    #                  merged under the class's own). Defaults to false.
    #
    # The combined value is always a new object. The stored Array is never
    # modified, so repeated reads return the same result instead of piling
    # up inherited entries. A superclass that holds nothing contributes
    # nothing.
    def sattr_reader(sym, options = {})
      options = { :inheritable => false }.merge(options)
      inheritable = options[:inheritable]
      ivar = "@#{sym}"

      singleton = class << self; self; end
      singleton.send(:define_method, sym) { instance_variable_get(ivar) }

      define_method(sym) do
        result = self.class.send(sym)
        if inheritable
          parent = self.class.superclass
          inherited = parent.respond_to?(sym) ? parent.send(sym) : nil
          if result.is_a?(Array)
            result = (result + [inherited].compact).flatten
          elsif result.is_a?(Hash) && inherited.is_a?(Hash)
            result = inherited.merge(result)
          end
        end
        result
      end
    end

    # Defines a class-level writer and an instance-level writer for +sym+.
    # The instance writer assigns on the instance's class.
    def sattr_writer(sym)
      ivar = "@#{sym}"
      writer = "#{sym}="

      singleton = class << self; self; end
      singleton.send(:define_method, writer) { |value| instance_variable_set(ivar, value) }

      define_method(writer) { |value| self.class.send(writer, value) }
    end

    # Defines both the readers and the writers for +sym+.
    def sattr_accessor(sym, options = {})
      sattr_reader(sym, options)
      sattr_writer(sym)
    end

  end

end
@@ -0,0 +1,9 @@
1
module DocWrapper
  # Property definition for plain text values.
  class StringPropertyDefinition < InnerHtmlPropertyDefinition
    # Runs non-blank text through options[:parser] when one is configured,
    # then passes whatever results (blank text included) through the block
    # when one was given.
    def transform(result)
      parser = options[:parser]
      result = parser.call(result) unless result.blank? || !parser
      block ? block.call(result) : result
    end
  end
end
@@ -0,0 +1,12 @@
1
module DocWrapper
  # Property definition that turns the extracted text into a time.
  class TimePropertyDefinition < InnerHtmlPropertyDefinition
    # A block, when given, takes over completely and receives the text as is.
    # Otherwise blank text gives nil, and non-blank text goes through
    # options[:parser] if present or Time.parse if not.
    def transform(result)
      return block.call(result) if block
      return nil if result.blank?

      parser = options[:parser]
      parser ? parser.call(result) : Time.parse(result)
    end
  end
end
@@ -1,3 +1,3 @@
1
1
  module DocWrapper
2
- VERSION = "0.0.1"
2
+ VERSION = "0.9.0"
3
3
  end
@@ -0,0 +1,171 @@
1
+ require 'spec_helper'
2
+
3
+
4
+
5
# Specs for DocWrapper. Two HTML fixture files are parsed and wrapped in a
# TestDocWrapper; each declared property is then checked twice: through the
# Hash returned by #properties and through its generated accessor method.
+ describe DocWrapper do
6
+
7
+ describe "The wrapped test documents" do
8
+ let(:doc1) do
9
+ Nokogiri::HTML(read_fixture_file("doc_wrapper_test.html"))
10
+ end
11
+
12
+ let(:doc2) do
13
+ Nokogiri::HTML(read_fixture_file("doc_wrapper_test_2.html"))
14
+ end
15
+
# The wrapper receives both documents as one array; doc2 is reached by
# properties declared with :document => 2.
16
+ let(:document) do
17
+ TestDocWrapper.new([ doc1, doc2 ])
18
+ end
19
+
# Values as reported by #properties, keyed by property name.
20
+ describe "#properties" do
21
+ let(:document_properties) do
22
+ document.properties
23
+ end
24
+
25
+ it { document_properties[:paragraph].should == 'A paragraph' }
26
+ it "should strip HTML &nbsp; from string properties" do
27
+ pending
28
+ document_properties[:space_example].should == 'Space Example'
29
+ end
30
+ it { document_properties[:active].should be_true }
31
+ it { document_properties[:date].should == Date.parse('12-Dec-2009') }
32
+ it { document_properties[:date_with_block].should == 'result from block' }
33
+ it { document_properties[:time].should == Time.parse('12-Dec-2009 12:31 PM') }
34
+ it { document_properties[:parsed_date].should == Date.parse('12-Dec-2009') }
35
+ it { document_properties[:parsed_time].should == Time.parse('12-Dec-2009 12:31 PM') }
36
+ it { document_properties[:table_data].should == 'A table data' }
37
+ it { document_properties[:paragraph2].should == 'A second paragraph' }
38
+ it { document_properties[:combined_tds].should == 'A table data A table data' }
39
+ it { document_properties[:combined_date_and_time].should == Time.parse('12-Dec-2009 12:31 PM') }
40
+ it { document_properties[:arrayed_xpath].should == 'name1 position1 salary1' }
# 14 is the number of property and multi_property declarations in
# TestDocWrapper; its has_one and has_many declarations are not part of
# #properties.
41
+ it { document_properties.size.should == 14 }
42
+ end
43
+
# The same properties, this time through the generated accessor methods.
44
+ it { document.should respond_to(:paragraph) }
45
+ it { document.should respond_to(:table_data) }
46
+ it { document.should respond_to(:date) }
47
+ it { document.should respond_to(:parsed_date) }
48
+ it { document.should respond_to(:combined_tds) }
49
+ it { document.should respond_to(:combined_date_and_time) }
50
+ it { document.should respond_to(:arrayed_xpath) }
51
+
52
+ it { document.paragraph.should == 'A paragraph' }
53
+ it { document.paragraph2.should == 'A second paragraph' }
54
+ it { document.table_data.should == 'A table data' }
55
+ it { document.date.should == Date.parse('12-Dec-2009') }
56
+ it { document.parsed_date.should == Date.parse('12-Dec-2009') }
57
+ it { document.date_with_block.should == 'result from block' }
58
+ it { document.active.should == true }
59
+ it { document.time.should == Time.parse('12-Dec-2009 12:31 PM') }
60
+ it "should strip HTML &nbsp; from string properties" do
61
+ pending
62
+ document.space_example.should == 'Space Example'
63
+ end
64
+ it { document.combined_tds.should == 'A table data A table data' }
65
+ it { document.combined_date_and_time.should == Time.parse('12-Dec-2009 12:31 PM') }
66
+ it { document.arrayed_xpath.should == 'name1 position1 salary1' }
67
+ it { document.raw_property.should == 'raw' }
# line_items is declared with :start_row => 1 and :end_row => 4, which selects
# the four name/position/salary rows between the fixture table's header and
# footer rows.
68
+ it { document.line_items.size.should == 4 }
69
+
# has_one association built from the first paragraph.
70
+ describe "paragraph_wrapper" do
71
+ let(:paragraph_wrapper) do
72
+ document.paragraph_wrapper
73
+ end
74
+
75
+ it { paragraph_wrapper.paragraph.should == 'A paragraph'}
76
+ end
77
+
# Individual has_many items; each wraps one table row.
78
+ describe "line item 0" do
79
+ let(:line_items) do
80
+ document.line_items[0]
81
+ end
82
+
83
+ it { line_items.name.should == 'name1' }
84
+ it { line_items.position.should == 'position1' }
85
+ it { line_items.salary.should == 'salary1' }
86
+ end
87
+
88
+ describe "line item 1" do
89
+ let(:line_items) do
90
+ document.line_items[1]
91
+ end
92
+
93
+ it { line_items.name.should == 'name2' }
94
+ it { line_items.position.should == 'position2' }
95
+ it { line_items.salary.should == 'salary2' }
96
+ end
97
+
# has_one association built from the fixture's <div>.
98
+ describe "has_one person" do
99
+ let(:person) do
100
+ document.person
101
+ end
102
+
103
+ it { person.name.should == 'Mark Menard' }
104
+ it { person.home_town.should == 'Troy, NY' }
105
+ end
106
+
107
+ end
108
+ end
109
+
110
# Wrapper for one row of the has_many table in the test fixture; it is the
# class TestDocWrapper's line_items association instantiates per row. The
# selectors are relative XPaths picking the row's first three cells.
+ class TestDocLineItem
111
+ include DocWrapper::Base
112
+ include DocWrapper::Properties
113
+
114
+ property :name, :string, "./td[1]"
115
+ property :position, :string, "./td[2]"
116
+ property :salary, :string, "./td[3]"
117
+
118
+ end
119
+
120
# Wrapper used by TestDocWrapper's has_one :person association, which selects
# the fixture's <div>. The relative XPaths pick the first and second paragraph
# inside it.
+ class PersonWrapper
121
+ include DocWrapper::Properties
122
+ include DocWrapper::Base
123
+
124
+ property :name, :string, "./p[1]"
125
+ property :home_town, :string, "./p[2]"
126
+ end
127
+
128
# Wrapper used by TestDocWrapper's has_one :paragraph_wrapper association.
# The '.' selector refers to the wrapped node itself.
+ class ParagraphWrapper
129
+ include DocWrapper::Properties
130
+ include DocWrapper::Base
131
+
132
+ property :paragraph, :string, '.'
133
+ end
134
+
135
# Wrapper exercised by the specs. It declares one example of every kind of
# declaration (typed properties with and without parser or block,
# multi_property, has_many, has_one) against the two fixture documents.
+ class TestDocWrapper
136
+ include DocWrapper::Properties
137
+ include DocWrapper::Base
138
+
139
+ property :paragraph, :string, "/html/body/p[1]"
140
+ has_one :paragraph_wrapper, "/html/body/p[1]", ParagraphWrapper
141
+ property :space_example, :string, "/html/body/p[5]" do |x|
142
+ x.strip
143
+ end
144
+ property :active, :boolean, "/html/body/p[2]", :parser => lambda { |x| x == 'Yes' }
145
+ property :date, :date, "/html/body/p[3]"
146
+ property :parsed_date, :date, "/html/body/p[3]", :parser => lambda { |x| Date.parse(x) }
# The block's return value replaces the parsed date entirely.
147
+ property :date_with_block, :date, "/html/body/p[3]" do |x|
148
+ "result from block"
149
+ end
150
+ property :time, :time, "/html/body/p[4]"
151
+ property :parsed_time, :time, "/html/body/p[4]", :parser => lambda { |x| Time.parse(x) }
152
+ property :table_data, :string, "/html/body/table[1]/tr/td[1]"
# :document is 1-based: 2 selects the second document given to the wrapper.
153
+ property :paragraph2, :string, "/html/body/p[1]", :document => 2
# Rows 1..4 (zero-based, inclusive) of the matched <tr> elements.
154
+ has_many :line_items, "/html/body/table[2]/tr", TestDocLineItem, :start_row => 1, :end_row => 4
155
+ multi_property :combined_tds, ["/html/body/table[1]/tr/td[1]", "/html/body/table[1]/tr/td[1]"] do |elements|
156
+ elements.join(" ")
157
+ end
158
+ multi_property :combined_date_and_time, ["/html/body/p[6]", "/html/body/p[7]"] do |elements|
159
+ Time.parse(elements.join(" "))
160
+ end
161
+
162
+ # Example of a multi_property that uses an XPath that returns an array of elements.
163
+ multi_property :arrayed_xpath, ["/html/body/table[2]/tr[2]/td"] do |elements|
164
+ elements.join(" ")
165
+ end
# A :raw property hands the block the lookup result itself rather than text.
166
+ property :raw_property, :raw, "/html/body/p[8]" do |ns|
167
+ ns[0].attribute("class").inner_html
168
+ end
169
+
170
+ has_one :person, "/html/body/div", PersonWrapper
171
+ end
@@ -0,0 +1,29 @@
1
+ <html>
2
+ <body>
3
+ <p>A paragraph</p><!-- paragraph -->
4
+ <p>Yes</p><!-- active -->
5
+ <p>12-Dec-2009</p><!-- date -->
6
+ <p>12-Dec-2009 12:31 PM</p><!-- date and time -->
7
+ <p> &nbsp; Space Example&nbsp; &nbsp;</p>
8
+ <table>
9
+ <tr>
10
+ <td>A table data</td>
11
+ </tr>
12
+ </table>
13
+ <table class="has_many">
14
+ <tr><th colspan="3">header</th></tr>
15
+ <tr><td>name1</td><td>position1</td><td>salary1</td></tr>
16
+ <tr><td>name2</td><td>position2</td><td>salary2</td></tr>
17
+ <tr><td>name3</td><td>position3</td><td>salary3</td></tr>
18
+ <tr><td>name4</td><td>position4</td><td>salary4</td></tr>
19
+ <tr><th colspan="3">footer</th></tr>
20
+ </table>
21
+ <p>12-Dec-2009</p><!-- date -->
22
+ <p>12:31 PM</p><!-- time -->
23
+ <p class="raw">Raw paragraph.</p>
24
+ <div>
25
+ <p class="name">Mark Menard</p>
26
+ <p class="home_town">Troy, NY</p>
27
+ </div>
28
+ </body>
29
+ </html>
@@ -0,0 +1,5 @@
1
+ <html>
2
+ <body>
3
+ <p>A second paragraph</p>
4
+ </body>
5
+ </html>
@@ -0,0 +1,6 @@
1
+ require File.expand_path("../../lib/doc_wrapper", __FILE__)
2
+ require 'nokogiri'
3
+
4
# Returns the contents of +file_name+ from the fixtures directory that sits
# next to this file.
#
# File.read opens, reads and closes the file in one call; the previous
# File.open(...).read left the handle open until garbage collection.
def read_fixture_file(file_name)
  File.read(File.join(File.dirname(__FILE__), "fixtures", file_name))
end
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: doc_wrapper
3
3
  version: !ruby/object:Gem::Version
4
- hash: 29
4
+ hash: 59
5
5
  prerelease: false
6
6
  segments:
7
7
  - 0
8
+ - 9
8
9
  - 0
9
- - 1
10
- version: 0.0.1
10
+ version: 0.9.0
11
11
  platform: ruby
12
12
  authors:
13
13
  - Mark Menard
@@ -106,10 +106,27 @@ extra_rdoc_files: []
106
106
  files:
107
107
  - .gitignore
108
108
  - Gemfile
109
+ - Gemfile.lock
109
110
  - Rakefile
110
111
  - doc_wrapper.gemspec
111
112
  - lib/doc_wrapper.rb
113
+ - lib/doc_wrapper/base.rb
114
+ - lib/doc_wrapper/base_class_methods.rb
115
+ - lib/doc_wrapper/base_property_definition.rb
116
+ - lib/doc_wrapper/boolean_property_definition.rb
117
+ - lib/doc_wrapper/date_property_definition.rb
118
+ - lib/doc_wrapper/inner_html_property_definition.rb
119
+ - lib/doc_wrapper/multi_property_definition.rb
120
+ - lib/doc_wrapper/properties.rb
121
+ - lib/doc_wrapper/raw_property_definition.rb
122
+ - lib/doc_wrapper/sattr_accessor.rb
123
+ - lib/doc_wrapper/string_property_definition.rb
124
+ - lib/doc_wrapper/time_property_definition.rb
112
125
  - lib/doc_wrapper/version.rb
126
+ - spec/doc_wrapper_spec.rb
127
+ - spec/fixtures/doc_wrapper_test.html
128
+ - spec/fixtures/doc_wrapper_test_2.html
129
+ - spec/spec_helper.rb
113
130
  has_rdoc: true
114
131
  homepage: http://rubygems.org/gems/doc_wrapper
115
132
  licenses: []