sax-machine 0.2.1 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (35)
  1. checksums.yaml +7 -0
  2. data/.gitignore +2 -0
  3. data/.travis.yml +14 -4
  4. data/Gemfile +5 -1
  5. data/Guardfile +2 -2
  6. data/HISTORY.md +23 -6
  7. data/README.md +111 -40
  8. data/Rakefile +4 -3
  9. data/lib/sax-machine.rb +11 -2
  10. data/lib/sax-machine/{sax_ancestor_config.rb → config/sax_ancestor.rb} +3 -7
  11. data/lib/sax-machine/{sax_attribute_config.rb → config/sax_attribute.rb} +4 -6
  12. data/lib/sax-machine/{sax_collection_config.rb → config/sax_collection.rb} +6 -10
  13. data/lib/sax-machine/{sax_element_config.rb → config/sax_element.rb} +16 -17
  14. data/lib/sax-machine/{sax_element_value_config.rb → config/sax_element_value.rb} +5 -7
  15. data/lib/sax-machine/{sax_handler.rb → handlers/sax_abstract_handler.rb} +28 -32
  16. data/lib/sax-machine/handlers/sax_nokogiri_handler.rb +16 -0
  17. data/lib/sax-machine/handlers/sax_ox_handler.rb +41 -0
  18. data/lib/sax-machine/sax_config.rb +9 -9
  19. data/lib/sax-machine/sax_configure.rb +1 -6
  20. data/lib/sax-machine/sax_document.rb +28 -17
  21. data/lib/sax-machine/version.rb +2 -2
  22. data/sax-machine.gemspec +8 -11
  23. data/spec/fixtures/atom-content.html +15 -0
  24. data/spec/{sax-machine → fixtures}/atom.xml +0 -0
  25. data/spec/sax-machine/sax_activerecord_spec.rb +23 -0
  26. data/spec/sax-machine/sax_configure_spec.rb +48 -0
  27. data/spec/sax-machine/sax_document_spec.rb +333 -280
  28. data/spec/sax-machine/sax_include_spec.rb +43 -0
  29. data/spec/spec_helper.rb +11 -2
  30. metadata +36 -41
  31. data/spec/benchmarks/amazon.xml +0 -40
  32. data/spec/benchmarks/benchmark.rb +0 -158
  33. data/spec/benchmarks/public_timeline.xml +0 -411
  34. data/spec/sax-machine/configure_sax_machine_spec.rb +0 -53
  35. data/spec/sax-machine/include_sax_machine_spec.rb +0 -42
@@ -1,13 +1,12 @@
1
1
  module SAXMachine
2
2
  class SAXConfig
3
-
4
3
  class ElementValueConfig
5
4
  attr_reader :name, :setter
6
5
 
7
6
  def initialize(name, options)
8
- @name = name.to_s
9
- @as = options[:as]
10
- @setter = "#{@as}="
7
+ @name = name.to_s
8
+ @as = options[:as]
9
+ @setter = "#{@as}="
11
10
  @required = options[:required]
12
11
  end
13
12
 
@@ -16,9 +15,8 @@ module SAXMachine
16
15
  end
17
16
 
18
17
  def required?
19
- @required
18
+ !!@required
20
19
  end
21
20
  end
22
-
23
21
  end
24
- end
22
+ end
@@ -1,8 +1,7 @@
1
- require "nokogiri"
2
- require "time"
1
+ require 'time'
3
2
 
4
3
  module SAXMachine
5
- class SAXHandler < Nokogiri::XML::SAX::Document
4
+ module SAXAbstractHandler
6
5
  NO_BUFFER = :no_buffer
7
6
 
8
7
  class StackNode < Struct.new(:object, :config, :buffer)
@@ -13,14 +12,14 @@ module SAXMachine
13
12
  end
14
13
  end
15
14
 
16
- def initialize(object, on_error = nil, on_warning = nil)
15
+ def _initialize(object, on_error = nil, on_warning = nil)
17
16
  @stack = [ StackNode.new(object) ]
18
17
  @parsed_configs = {}
19
18
  @on_error = on_error
20
19
  @on_warning = on_warning
21
20
  end
22
21
 
23
- def characters(data)
22
+ def _characters(data)
24
23
  node = stack.last
25
24
 
26
25
  if node.buffer == NO_BUFFER
@@ -29,10 +28,8 @@ module SAXMachine
29
28
  node.buffer << data
30
29
  end
31
30
  end
32
- alias cdata_block characters
33
-
34
- def start_element(name, attrs = [])
35
31
 
32
+ def _start_element(name, attrs = [])
36
33
  name = normalize_name(name)
37
34
  node = stack.last
38
35
  object = node.object
@@ -61,10 +58,10 @@ module SAXMachine
61
58
  if !collection_config && element_config = sax_config.element_config_for_tag(name, attrs)
62
59
  new_object =
63
60
  case element_config.data_class.to_s
64
- when 'Integer' then 0
65
- when 'Float' then 0.0
66
- when 'Time' then Time.at(0)
67
- when '' then object
61
+ when "Integer" then 0
62
+ when "Float" then 0.0
63
+ when "Time" then Time.at(0)
64
+ when "" then object
68
65
  else
69
66
  element_config.data_class.new
70
67
  end
@@ -76,7 +73,7 @@ module SAXMachine
76
73
  end
77
74
  end
78
75
 
79
- def end_element(name)
76
+ def _end_element(name)
80
77
  name = normalize_name(name)
81
78
 
82
79
  start_tag = stack[-2]
@@ -107,13 +104,13 @@ module SAXMachine
107
104
  else
108
105
  value =
109
106
  case config.data_class.to_s
110
- when 'String' then value.to_s
111
- when 'Integer' then value.to_i
112
- when 'Float' then value.to_f
107
+ when "String" then value.to_s
108
+ when "Integer" then value.to_i
109
+ when "Float" then value.to_f
113
110
  # Assumes that time elements will be string-based and are not
114
111
  # something else, e.g. seconds since epoch
115
- when 'Time' then Time.parse(value.to_s)
116
- when '' then value
112
+ when "Time" then Time.parse(value.to_s)
113
+ when "" then value
117
114
  else
118
115
  element
119
116
  end
@@ -134,30 +131,29 @@ module SAXMachine
134
131
  stack.pop
135
132
  end
136
133
 
137
- private
138
-
139
- def mark_as_parsed(object, element_config)
140
- unless element_config.collection?
141
- @parsed_configs[[object.object_id, element_config.object_id]] = true
134
+ def _error(string)
135
+ if @on_error
136
+ @on_error.call(string)
142
137
  end
143
138
  end
144
139
 
145
- def parsed_config?(object, element_config)
146
- @parsed_configs[[object.object_id, element_config.object_id]]
147
- end
148
-
149
- def warning(string)
140
+ def _warning(string)
150
141
  if @on_warning
151
142
  @on_warning.call(string)
152
143
  end
153
144
  end
154
145
 
155
- def error(string)
156
- if @on_error
157
- @on_error.call(string)
146
+ private
147
+
148
+ def mark_as_parsed(object, element_config)
149
+ unless element_config.collection?
150
+ @parsed_configs[[object.object_id, element_config.object_id]] = true
158
151
  end
159
152
  end
160
153
 
154
+ def parsed_config?(object, element_config)
155
+ @parsed_configs[[object.object_id, element_config.object_id]]
156
+ end
161
157
 
162
158
  def sax_config_for(object)
163
159
  if object.class.respond_to?(:sax_config)
@@ -172,7 +168,7 @@ module SAXMachine
172
168
  end
173
169
 
174
170
  def normalize_name(name)
175
- name.gsub(/\-/, '_')
171
+ name.gsub(/\-/, "_")
176
172
  end
177
173
 
178
174
  def set_attributes_on(object, attributes)
@@ -0,0 +1,16 @@
1
+ require 'sax-machine/handlers/sax_abstract_handler'
2
+ require 'nokogiri'
3
+
4
+ module SAXMachine
5
+ class SAXNokogiriHandler < Nokogiri::XML::SAX::Document
6
+ include SAXAbstractHandler
7
+
8
+ alias_method :initialize, :_initialize
9
+ alias_method :characters, :_characters
10
+ alias_method :cdata_block, :_characters
11
+ alias_method :start_element, :_start_element
12
+ alias_method :end_element, :_end_element
13
+ alias_method :error, :_error
14
+ alias_method :warning, :_warning
15
+ end
16
+ end
@@ -0,0 +1,41 @@
1
+ require 'sax-machine/handlers/sax_abstract_handler'
2
+ require 'ox'
3
+
4
+ module SAXMachine
5
+ class SAXOxHandler < Ox::Sax
6
+ include SAXAbstractHandler
7
+
8
+ def initialize(*args)
9
+ _initialize(*args)
10
+ _reset_element
11
+ end
12
+
13
+ def attr(name, str)
14
+ @attrs[name] = str
15
+ end
16
+
17
+ def attrs_done
18
+ _start_element(@element, @attrs)
19
+ _reset_element
20
+ end
21
+
22
+ def start_element(name)
23
+ @element = name
24
+ end
25
+
26
+ def error(message, line, column)
27
+ _error("#{message} on line #{line} column #{column}")
28
+ end
29
+
30
+ alias_method :text, :_characters
31
+ alias_method :cdata, :_characters
32
+ alias_method :end_element, :_end_element
33
+
34
+ private
35
+
36
+ def _reset_element
37
+ @attrs = {}
38
+ @element = ""
39
+ end
40
+ end
41
+ end
@@ -1,17 +1,16 @@
1
- require "sax-machine/sax_attribute_config"
2
- require "sax-machine/sax_element_value_config"
3
- require "sax-machine/sax_element_config"
4
- require "sax-machine/sax_collection_config"
5
- require "sax-machine/sax_ancestor_config"
1
+ require "sax-machine/config/sax_attribute"
2
+ require "sax-machine/config/sax_element_value"
3
+ require "sax-machine/config/sax_element"
4
+ require "sax-machine/config/sax_collection"
5
+ require "sax-machine/config/sax_ancestor"
6
6
 
7
7
  module SAXMachine
8
8
  class SAXConfig
9
-
10
9
  attr_accessor :top_level_elements, :top_level_attributes, :top_level_element_value, :collection_elements, :ancestors
11
10
 
12
11
  def initialize
13
12
  # Default value is an empty array
14
- @top_level_elements = Hash.new { |hash, key| hash[key] = [] }
13
+ @top_level_elements = Hash.new { |hash, key| hash[key] = [] }
15
14
  @top_level_attributes = []
16
15
  @top_level_element_value = []
17
16
  @collection_elements = Hash.new { |hash, key| hash[key] = [] }
@@ -19,11 +18,12 @@ module SAXMachine
19
18
  end
20
19
 
21
20
  def columns
22
- @top_level_elements.map {|name, ecs| ecs }.flatten
21
+ @top_level_elements.map { |name, ecs| ecs }.flatten
23
22
  end
24
23
 
25
24
  def initialize_copy(sax_config)
26
25
  super
26
+
27
27
  @top_level_elements = sax_config.top_level_elements.clone
28
28
  @top_level_attributes = sax_config.top_level_attributes.clone
29
29
  @top_level_element_value = sax_config.top_level_element_value.clone
@@ -71,4 +71,4 @@ module SAXMachine
71
71
  @top_level_elements[name.to_s].detect { |ec| ec.attrs_match?(attrs) }
72
72
  end
73
73
  end
74
- end
74
+ end
@@ -1,5 +1,4 @@
1
1
  module SAXMachine
2
-
3
2
  def self.configure(clazz)
4
3
  extended_clazz = Class.new(clazz)
5
4
  extended_clazz.send(:include, SAXMachine)
@@ -18,11 +17,9 @@ module SAXMachine
18
17
  (class << clazz;self;end).send(:define_method, :parse) do |xml_text|
19
18
  extended_clazz.parse(xml_text)
20
19
  end
21
-
22
20
  end
23
21
 
24
22
  module LightWeightSaxMachine
25
-
26
23
  attr_writer :sax_config
27
24
 
28
25
  def sax_config
@@ -32,7 +29,5 @@ module SAXMachine
32
29
  def inherited(subclass)
33
30
  subclass.sax_config.send(:initialize_copy, self.sax_config)
34
31
  end
35
-
36
32
  end
37
-
38
- end
33
+ end
@@ -1,18 +1,30 @@
1
1
  require "nokogiri"
2
2
 
3
3
  module SAXMachine
4
-
5
4
  def self.included(base)
6
5
  base.send(:include, InstanceMethods)
7
- base.extend ClassMethods
6
+ base.extend(ClassMethods)
8
7
  end
9
8
 
10
9
  def parse(xml_text, on_error = nil, on_warning = nil)
11
- sax_handler = SAXHandler.new(self, on_error, on_warning)
12
- parser = Nokogiri::XML::SAX::Parser.new(sax_handler)
13
- parser.parse(xml_text) do |ctx|
14
- ctx.replace_entities = true
10
+ if SAXMachine.handler == :ox
11
+ Ox.sax_parse(
12
+ SAXOxHandler.new(self, on_error, on_warning),
13
+ StringIO.new(xml_text),
14
+ {
15
+ symbolize: false,
16
+ convert_special: true,
17
+ skip: :skip_return,
18
+ }
19
+ )
20
+ else
21
+ handler = SAXNokogiriHandler.new(self, on_error, on_warning)
22
+ parser = Nokogiri::XML::SAX::Parser.new(handler)
23
+ parser.parse(xml_text) do |ctx|
24
+ ctx.replace_entities = true
25
+ end
15
26
  end
27
+
16
28
  self
17
29
  end
18
30
 
@@ -25,13 +37,12 @@ module SAXMachine
25
37
  end
26
38
 
27
39
  module ClassMethods
28
-
29
40
  def inherited(subclass)
30
41
  subclass.sax_config.send(:initialize_copy, self.sax_config)
31
42
  end
32
43
 
33
- def parse(xml_text, on_error = nil, on_warning = nil)
34
- new.parse(xml_text, on_error, on_warning)
44
+ def parse(*args)
45
+ new.parse(*args)
35
46
  end
36
47
 
37
48
  def element(name, options = {})
@@ -42,13 +53,13 @@ module SAXMachine
42
53
 
43
54
  def attribute(name, options = {})
44
55
  real_name = (options[:as] ||= name).to_s
45
- sax_config.add_top_level_attribute(self.class.to_s, options.merge(:name => name))
56
+ sax_config.add_top_level_attribute(self.class.to_s, options.merge(name: name))
46
57
  create_attr real_name
47
58
  end
48
59
 
49
60
  def value(name, options = {})
50
61
  real_name = (options[:as] ||= name).to_s
51
- sax_config.add_top_level_element_value(self.class.to_s, options.merge(:name => name))
62
+ sax_config.add_top_level_element_value(self.class.to_s, options.merge(name: name))
52
63
  create_attr real_name
53
64
  end
54
65
 
@@ -80,6 +91,7 @@ module SAXMachine
80
91
 
81
92
  def elements(name, options = {})
82
93
  options[:as] ||= name
94
+
83
95
  if options[:class]
84
96
  sax_config.add_collection_element(name, options)
85
97
  else
@@ -88,7 +100,7 @@ module SAXMachine
88
100
  #{options[:as]} << value
89
101
  end
90
102
  SRC
91
- sax_config.add_top_level_element(name, options.merge(:collection => true))
103
+ sax_config.add_top_level_element(name, options.merge(collection: true))
92
104
  end
93
105
 
94
106
  if !method_defined?(options[:as].to_s)
@@ -99,7 +111,7 @@ module SAXMachine
99
111
  SRC
100
112
  end
101
113
 
102
- attr_writer options[:as] unless method_defined?("#{options[:as]}=")
114
+ attr_writer(options[:as]) unless method_defined?("#{options[:as]}=")
103
115
  end
104
116
 
105
117
  def sax_config
@@ -109,10 +121,9 @@ module SAXMachine
109
121
  # we only want to insert the getter and setter if they haven't defined it from elsewhere.
110
122
  # this is how we allow custom parsing behavior. So you could define the setter
111
123
  # and have it parse the string into a date or whatever.
112
- def create_attr real_name
113
- attr_reader real_name unless method_defined?(real_name)
114
- attr_writer real_name unless method_defined?("#{real_name}=")
124
+ def create_attr(real_name)
125
+ attr_reader(real_name) unless method_defined?(real_name)
126
+ attr_writer(real_name) unless method_defined?("#{real_name}=")
115
127
  end
116
128
  end
117
-
118
129
  end
@@ -1,3 +1,3 @@
1
1
  module SAXMachine
2
- VERSION = "0.2.1"
3
- end
2
+ VERSION = "0.3.0"
3
+ end
@@ -1,24 +1,21 @@
1
1
  # -*- encoding: utf-8 -*-
2
- require File.expand_path('../lib/sax-machine/version', __FILE__)
2
+ require File.expand_path("../lib/sax-machine/version", __FILE__)
3
3
 
4
4
  Gem::Specification.new do |s|
5
- s.name = 'sax-machine'
6
- s.version = SAXMachine::VERSION
5
+ s.name = "sax-machine"
6
+ s.version = SAXMachine::VERSION
7
7
 
8
- s.authors = ["Paul Dix", "Julien Kirch", "Ezekiel Templin"]
8
+ s.authors = ["Paul Dix", "Julien Kirch", "Ezekiel Templin", "Dmitry Krasnoukhov"]
9
9
  s.email = %q{paul@pauldix.net}
10
10
  s.homepage = %q{http://github.com/pauldix/sax-machine}
11
-
12
- s.summary = %q{Declarative SAX Parsing with Nokogiri}
13
-
11
+ s.summary = %q{Declarative SAX Parsing with Nokogiri or Ox}
14
12
  s.license = %q{MIT}
15
13
 
16
14
  s.files = `git ls-files`.split("\n")
17
15
  s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
18
16
  s.require_paths = ["lib"]
17
+ s.platform = Gem::Platform::RUBY
19
18
 
20
- s.platform = Gem::Platform::RUBY
21
-
22
- s.add_dependency 'nokogiri', "~> 1.6.0"
23
- s.add_development_dependency "rspec", "~> 2.13.0"
19
+ s.add_dependency "nokogiri", "~> 1.6.0"
20
+ s.add_development_dependency "rspec", "~> 3.0"
24
21
  end
@@ -0,0 +1,15 @@
1
+
2
+ <div xmlns="http://www.w3.org/1999/xhtml"><p>In my previous <a href="http://www.pauldix.net/2008/08/serializing-dat.html">post about the speed of serializing data</a>, I concluded that Marshal was the quickest way to get things done. So I set about using Marshal to store some data in an ActiveRecord object. Things worked great at first, but on some test data I got this error: marshal data too short. Luckily, <a href="http://www.brynary.com/">Bryan Helmkamp</a> had helpfully pointed out that there were sometimes problems with storing marshaled data in the database. He said it was best to base64 encode the marshal dump before storing.</p>
3
+
4
+ <p>I was curious why it was working on some things and not others. It turns out that some types of data being marshaled were causing the error to pop up. Here's the test data I used in my specs:</p>
5
+ <pre>{ :foo =&gt; 3, :bar =&gt; 2 } # hash with symbols for keys and integer values<br />[3, 2.1, 4, 8]&nbsp; &nbsp;&nbsp; &nbsp;&nbsp; &nbsp;&nbsp; # array with integer and float values</pre>
6
+ <p>Everything worked when I switched the array values to all integers so it seems that floats were causing the problem. However, in the interest of keeping everything working regardless of data types, I base64 encoded before going into the database and decoded on the way out.</p>
7
+
8
+ <p>I also ran the benchmarks again to determine what impact this would have on speed. Here are the results for 100 iterations on a 10k element array and a 10k element hash with and without base64 encode/decode:</p>
9
+ <pre>&nbsp; &nbsp;&nbsp; &nbsp;&nbsp; &nbsp;&nbsp; &nbsp;&nbsp; &nbsp; user&nbsp; &nbsp;&nbsp; &nbsp; system&nbsp; &nbsp;&nbsp; total&nbsp; &nbsp;&nbsp; &nbsp; real<br />array marshal&nbsp; 0.200000&nbsp; &nbsp;0.010000&nbsp; &nbsp;0.210000 (&nbsp; 0.214018) (without Base64)<br />array marshal&nbsp; 0.220000&nbsp; &nbsp;0.010000&nbsp; &nbsp;0.230000 (&nbsp; 0.250260)<br /><br />hash marshal&nbsp; &nbsp;1.830000&nbsp; &nbsp;0.040000&nbsp; &nbsp;1.870000 (&nbsp; 1.892874) (without Base64)<br />hash marshal&nbsp; &nbsp;2.040000&nbsp; &nbsp;0.100000&nbsp; &nbsp;2.140000 (&nbsp; 2.170405)</pre>
10
+ <p>As you can see the difference in speed is pretty negligible. I assume that the error has to do with AR cleaning the stuff that gets inserted into the database, but I'm not really sure. In the end it's just easier to use Base64.encode64 when serializing data into a text field in ActiveRecord using Marshal.</p>
11
+
12
+ <p>I've also read people posting about this error when using the database session store. I can only assume that it's because they were trying to store either way too much data in their session (too much for a regular text field) or they were storing float values or some other data type that would cause this to pop up. Hopefully this helps.</p></div>
13
+ <div class="feedflare">
14
+ <a href="http://feeds.feedburner.com/~f/PaulDixExplainsNothing?a=rWfWO"><img src="http://feeds.feedburner.com/~f/PaulDixExplainsNothing?i=rWfWO" border="0"></img></a> <a href="http://feeds.feedburner.com/~f/PaulDixExplainsNothing?a=RaCqo"><img src="http://feeds.feedburner.com/~f/PaulDixExplainsNothing?i=RaCqo" border="0"></img></a> <a href="http://feeds.feedburner.com/~f/PaulDixExplainsNothing?a=1CBLo"><img src="http://feeds.feedburner.com/~f/PaulDixExplainsNothing?i=1CBLo" border="0"></img></a>
15
+ </div><img src="http://feeds.feedburner.com/~r/PaulDixExplainsNothing/~4/383536354" height="1" width="1"/>