axml 0.0.3 → 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README CHANGED
@@ -1,30 +1,27 @@
1
1
  AXML
2
2
  ====
3
3
 
4
- AXML - Provides a simple DOM for working with XML (using XMLParser under the
5
- hood) that can serve as a drop in replacement for a subset of basic libxml
6
- functionality (e.g., each, children, child, find_first, find, next).
4
+ AXML - Provides a simple, minimalistic DOM for working with data stored in an
5
+ XML document. The API is very similar to LibXML, differing slightly in the
6
+ handling of text nodes. It is designed with very large documents in mind: nodes are represented in memory-efficient Struct objects, and it works with either XMLParser or LibXML!
7
7
 
8
- 'AXML' means 'ax XML' which succinctly describes the occasional feeling of a
9
- programmer towards XML or its myriad parsers. AXML won't solve all your
10
- problems, but it does make working with XML much less painful.
8
+ 'AXML' literally translates into 'ax XML' which succinctly describes the
9
+ occasional feeling of a programmer towards XML or its myriad parsers. AXML
10
+ won't solve all your XML woes, but it does make working with XML much less
11
+ painful.
11
12
 
12
13
  Features
13
14
  --------
14
15
 
15
- * *fast*: it's implemented in XMLParser (expat under the hood)
16
- * *lean*: as in 'lines of code' (~220 w/ blank lines) and as in 'memory consumption' (nodes implemented as Struct, children in Array)
16
+ * *fast*: runs on either XMLParser or LibXML
17
+ * *lean*: as in 'lines of code' and as in 'memory consumption' (nodes implemented as Struct, children in Array)
17
18
  * *easy to extend*: code your Grandmother could read and understand (if she reads ruby)
18
- * *quacks like libxml*: implements a very useful subset of libxml methods for near drop in replacement.
19
-
19
+ * PLOS: implements a useful subset of libxml methods for near drop in replacement.
20
20
 
21
21
  Examples
22
22
  --------
23
23
 
24
- require 'axml' # currently requires 'xmlparser' be installed
25
- # Windows: already in one-click-installer
26
- # Ubuntu: sudo apt-get install libxml-parser-ruby1.8
27
- # Cygwin: see http://mspire.rubyforge.org/tutorial/cygwin_mspire.html
24
+ require 'axml'
28
25
 
29
26
  # a little example xml string to use
30
27
  string_or_io = "
@@ -39,18 +36,24 @@ Examples
39
36
  </n1>
40
37
  "
41
38
 
42
- ### Read a string or io
39
+ ### Read a string, io, or file
43
40
 
44
41
  n1_node = AXML.parse(string_or_io)
45
-
46
- ### Read a file
47
-
48
- n1_node = AXML.parse_file('path/to/file')
42
+ # --or--
43
+ n1_node = AXML.parse('path/to/file')
49
44
 
50
45
  ### Access children
51
46
 
52
47
  n1_node.children # -> [array]
53
- n1_node.each {|child| # do something with child }
48
+ n1_node.each {|child| # do something with each child }
49
+
50
+ ### Traverse the whole tree structure
51
+
52
+ n1_node.traverse do |node|
53
+ # pre traversal
54
+ end
55
+
56
+ n1_node.traverse(:post) {|node| # post traversal }
54
57
 
55
58
  ### Get attributes and text
56
59
 
@@ -59,7 +62,7 @@ Examples
59
62
  n3_node.text # -> 'words here'
60
63
  n3_node.content # -> [same]
61
64
 
62
- ### Traverse nodes with next and child
65
+ ### Navigate nodes
63
66
 
64
67
  n2_node = n1_node.child
65
68
  the_other_n2_node = n2_node.next
@@ -71,26 +74,12 @@ Examples
71
74
  n3_node = n1_node.find_first('descendant::n3')
72
75
  other_n3_node = n3_node.find_first('following-sibling::n3')
73
76
  n1_node.find_first('child::n3') # -> nil
77
+ # also callable as find_first_child and find_first_descendant
74
78
 
75
79
  # find (returns an array)
76
- n1_node.find('descendant::n3') # -> [array of all 3 <n3> nodes]
77
80
  n1_node.find('child::n2') # -> [array of 2 <n2> nodes]
78
-
79
- ### Switch to libxml
80
-
81
- This is all it takes to get all of the above code to work under libxml:
82
-
83
- require 'xml/libxml' # instead of: require 'axml'
84
-
85
- # A file
86
- REPLACE: n1_node = AXML.parse_file(file)
87
- WITH: n1_node = XML::Document.file(file).root # note the .root call on the end!
88
-
89
- # A string
90
- REPLACE: n1_node = AXML.parse(string)
91
- WITH: n1_node = XML::Parser.string(string).parse.root # note the .root call on the end!
92
-
93
- Wallah! All the above method calls work under libxml
81
+ n1_node.find('descendant::n3') # -> [array of all 3 <n3> nodes]
82
+ # also callable as find_child and find_descendant
94
83
 
95
84
 
96
85
  See `specs/axml_spec.rb` for more examples and functionality
@@ -107,3 +96,10 @@ Installation
107
96
  ------------
108
97
 
109
98
  gem install axml
99
+
100
+ See Also
101
+ --------
102
+
103
+ If you are parsing HTML or complex word-processing documents, this is not the parser for you. Try something like hpricot or LibXML.
104
+
105
+
data/Rakefile CHANGED
@@ -2,9 +2,9 @@ require 'rake'
2
2
  require 'rubygems'
3
3
  require 'rake/rdoctask'
4
4
  require 'rake/gempackagetask'
5
+ require 'rake/testtask'
5
6
  require 'rake/clean'
6
7
  require 'fileutils'
7
- #require 'spec/rake/spectask'
8
8
  require 'email_encrypt'
9
9
 
10
10
  ###############################################
@@ -59,7 +59,7 @@ task :html_docs do
59
59
 
60
60
  # add contact info:
61
61
  index.puts '<h2>Contact</h2>'
62
- index.puts 'jprince@icmb.utexas.edu'.email_encrypt
62
+ index.puts 'jtprince@gmail.com'.email_encrypt
63
63
 
64
64
  index.puts '</body></html>'
65
65
  end
@@ -75,64 +75,20 @@ end
75
75
  # TESTS
76
76
  ###############################################
77
77
 
78
-
79
- task :ensure_gem_is_uninstalled do
80
- reply = `#{$gemcmd} list -l #{NAME}`
81
- if reply.include? NAME + " ("
82
- puts "GOING to uninstall gem '#{NAME}' for testing"
83
- if WIN32
84
- %x( #{$gemcmd} uninstall -x #{NAME} )
85
- else
86
- %x( sudo #{$gemcmd} uninstall -x #{NAME} )
87
- end
88
- end
78
+ desc 'Default: Run specs.'
79
+ task :default => :spec
80
+
81
+ desc 'Run specs.'
82
+ Rake::TestTask.new(:spec) do |t|
83
+ t.verbose = true
84
+ t.warning = true
85
+ ENV['RUBYOPT'] = 'rubygems'
86
+ ENV['TEST'] = ENV['SPEC'] if ENV['SPEC']
87
+ t.libs = ['lib']
88
+ t.test_files = Dir.glob( File.join('spec', ENV['pattern'] || '**/*_spec.rb') )
89
+ t.options = "-v"
89
90
  end
90
91
 
91
- #namespace :spec do
92
- # task :autotest do
93
- # require './specs/rspec_autotest'
94
- # RspecAutotest.run
95
- # end
96
- #end
97
-
98
- #desc "Run specs"
99
- #Spec::Rake::SpecTask.new('spec') do |t|
100
- # Rake::Task[:ensure_gem_is_uninstalled].invoke
101
- # t.libs = ['lib']
102
- # t.spec_files = FileList['specs/**/*_spec.rb']
103
- #end
104
-
105
- #desc "Run specs and output specdoc"
106
- #Spec::Rake::SpecTask.new('specl') do |t|
107
- # Rake::Task[:ensure_gem_is_uninstalled].invoke
108
- # t.spec_files = FileList['specs/**/*_spec.rb']
109
- # t.libs = ['lib']
110
- # t.spec_opts = ['--format', 'specdoc' ]
111
- #end
112
-
113
- #desc "Run all specs with RCov"
114
- #Spec::Rake::SpecTask.new('rcov') do |t|
115
- # Rake::Task[:ensure_gem_is_uninstalled].invoke
116
- # t.spec_files = FileList['specs/**/*_spec.rb']
117
- # t.rcov = true
118
- # t.libs = ['lib']
119
- # t.rcov_opts = ['--exclude', 'specs']
120
- #end
121
-
122
- #task :spec do
123
- # uninstall_gem
124
- # # files that match a key word
125
- # files_to_run = ENV['SPEC'] || FileList['specs/**/*_spec.rb']
126
- # if ENV['SPECM']
127
- # files_to_run = files_to_run.select do |file|
128
- # file.include?(ENV['SPECM'])
129
- # end
130
- # end
131
- # files_to_run.each do |spc|
132
- # system "ruby -I lib -S spec #{spc} --format specdoc"
133
- # end
134
- #end
135
-
136
92
  ###############################################
137
93
  # PACKAGE / INSTALL / UNINSTALL
138
94
  ###############################################
@@ -191,9 +147,11 @@ gemspec = Gem::Specification.new do |t|
191
147
  t.platform = Gem::Platform::RUBY
192
148
  t.name = NAME
193
149
  t.version = IO.readlines(changelog).grep(/##.*version/).pop.split(/\s+/).last.chomp
150
+ t.homepage = 'http://axml.rubyforge.org/'
151
+ t.rubyforge_project = 'axml'
194
152
  t.summary = summary
195
153
  t.date = "#{tm.year}-#{tm.month}-#{tm.day}"
196
- t.email = "jprince@icmb.utexas.edu"
154
+ t.email = "jtprince@gmail.com"
197
155
  t.description = description
198
156
  t.has_rdoc = true
199
157
  t.authors = ["John Prince"]
@@ -201,8 +159,8 @@ gemspec = Gem::Specification.new do |t|
201
159
  t.rdoc_options = rdoc_options
202
160
  t.extra_rdoc_files = rdoc_extra_includes
203
161
  t.executables = FL["bin/*"].map {|file| File.basename(file) }
204
- t.requirements << 'xmlparser is needed right now'
205
- t.test_files = FL["specs/*_spec.rb"]
162
+ t.requirements << 'xmlparser or libxml'
163
+ t.test_files = FL["spec/**/*_spec.rb"]
206
164
  end
207
165
 
208
166
  desc "Create packages."
data/lib/axml.rb CHANGED
@@ -1,376 +1,39 @@
1
- require 'xmlparser'
2
1
 
3
- class AXML
4
-
5
- NotBlankText_re = /[^\s+]+/m
6
-
7
- def self.parse_file(file)
8
- root = nil
9
- File.open(file) {|fh| root = parse(fh) }
10
- root
11
- end
12
-
13
- # Returns the root node (as Element) or nodes (as Array)
14
- # options:
15
- # :keep_blanks => *true | false
16
- def self.parse(stream, opts={:keep_blanks => false})
17
- parser = AXML::XMLParser.new
18
- if opts[:keep_blanks] == false
19
- parser.set_no_keep_blanks
20
- end
21
- if ti = opts[:text_indices]
22
- if ti.is_a?(Array) && ti.size > 1
23
- raise NotImplementedError, "currently only supports a single element"
24
- else
25
- ti =
26
- if ti.is_a?(Array)
27
- ti.first.to_s
28
- else
29
- ti.to_s
30
- end
31
- parser.set_single_text_indices(ti)
32
- end
33
- end
34
- parser.parse(stream)
35
- parser.root
36
- end
37
-
38
- end
39
-
40
- AXML::El = Struct.new(:parent, :name, :attrs, :text, :children, :array_index)
41
-
42
- class AXML::El
43
- include Enumerable
44
-
45
- # use AXML::El::Indent.replace to swap without warning
46
- # ["", " ", " ", " ", " ", " ", ... ]
47
- Indent = ' '
48
- # use AXML::El::Indentation.replace to replace w/o warning
49
- Indentation = (0...30).to_a.map {|num| Indent*num }
50
-
51
- # current depth
52
- @@depth = 0
53
-
54
- alias_method :content, :text
55
- alias_method :content=, :text=
56
- alias_method :kids, :children
57
- alias_method :kids=, :children=
58
-
59
- def [](attribute_string)
60
- attrs[attribute_string]
61
- end
62
-
63
- def []=(attribute_string, value)
64
- attrs[attribute_string] = value
65
- end
66
-
67
- # has text?
68
- def text?
69
- !!text
70
- end
71
-
72
- def children?
73
- children.size > 0
74
- end
75
- alias_method :child?, :children?
76
-
77
- # full traversal from the initial node
78
- def traverse(type=:pre, &block)
79
- if type == :pre
80
- block.call(self)
81
- end
82
- children.each do |child|
83
- child.traverse(type, &block)
84
- end
85
- if type == :post
86
- block.call(self)
87
- end
88
- end
89
-
90
- def each(&block)
91
- children.each do |child|
92
- block.call(child)
93
- end
94
- end
95
-
96
- # drops the current element from the list of its parents children
97
- def drop
98
- parent.children.delete(self)
99
- end
100
-
101
- def drop_child(node)
102
- found_it = false
103
- found_index = nil
104
- children.each_with_index do |v,i|
105
- if found_it
106
- v.array_index = i - 1
107
- end
108
- if v.object_id == node.object_id
109
- found_index = i
110
- found_it = true
111
- end
112
- end
113
- children.delete_at(found_index) if found_index
114
- end
115
-
116
- def tabs
117
- Indentation[@@depth]
118
- end
119
-
120
- EscapeCharsRe = /['"&><]/
121
-
122
- # returns data escaped if necessary
123
- def escape(data)
124
- # modified slightly from xmlsimple.rb
125
- return data if !data.is_a?(String) || data.nil? || data == ''
126
- result = data.dup
127
- if EscapeCharsRe.match(data)
128
- result.gsub!('&', '&amp;')
129
- result.gsub!('<', '&lt;')
130
- result.gsub!('>', '&gt;')
131
- result.gsub!('"', '&quot;')
132
- result.gsub!("'", '&apos;')
133
- end
134
- result
135
- end
136
-
137
- def to_s
138
- attstring = ""
139
- if attrs.size > 0
140
- attstring = " " + attrs.collect { |k,v| "#{k}=\"#{escape(v)}\"" }.join(" ")
141
- end
142
- string = "#{tabs}<#{name}#{attstring}"
143
- if children.size > 0
144
- string << ">"
145
- if text?
146
- string << escape(text)
147
- end
148
- string << "\n"
149
- @@depth += 1
150
- string << children.collect {|child| child.to_s }.join("")
151
- @@depth -= 1
152
- string << "#{tabs}</#{name}>\n"
153
- elsif text?
154
- string << ">" << escape(text) << "</#{name}>\n"
155
- else
156
- string << "/>\n"
157
- end
158
- string
159
- end
160
-
161
- def inspect
162
- "<name='#{name}' attrs='#{attrs.inspect}' children.size=#{children.size}>"
163
- end
164
-
165
- # the next node
166
- def next
167
- parent.children[array_index+1]
168
- end
169
-
170
- # the first child (equivalent to children.first)
171
- def child
172
- children.first
173
- end
174
-
175
- def add_node(node)
176
- node.array_index = children.size
177
- children.push( node )
178
- end
179
-
180
- ########################################################################
181
- # FIND and FIND_FIRST (with a little useful xpath)
182
- ########################################################################
183
-
184
- # Returns an array of nodes. Accepts same xpath strings as find_first.
185
- def find(string)
186
- (tp, name) = string.split('::')
187
- case tp
188
- when 'child'
189
- find_children(name)
190
- when 'descendant'
191
- find_descendants(name)
192
- when 'following-sibling'
193
- find_following_siblings(name)
194
- end
195
- end
196
-
197
- # currently must be called with descendant:: or child:: string prefix! e.g.
198
- # "descendant::<name>" and "child::<name>" where <name> is the name of the
199
- # node you seek)
200
- def find_first(string)
201
- (tp, name) = string.split('::')
202
- case tp
203
- when 'child'
204
- find_first_child(name)
205
- when 'descendant'
206
- find_first_descendant(name)
207
- when 'following-sibling'
208
- find_first_following_sibling(name)
209
- end
210
- end
211
-
212
- def find_descendants(name, collect_descendants=[])
213
- children.each do |child|
214
- collect_descendants.push(child) if child.name == name
215
- child.find_descendants(name, collect_descendants)
216
- end
217
- collect_descendants
218
- end
219
-
220
- def find_first_descendant(name)
221
- self.each do |child_node|
222
- if child_node.name == name
223
- return child_node
2
+ require 'axml/autoload'
3
+
4
+ module AXML
5
+ # note that if Autoload must find a suitable parser, it will be set as the
6
+ # :parser default to be used for future reference.
7
+ DEFAULTS = {:keep_blanks => false, :parser => nil}
8
+ PREFERRED = [:xmlparser, :libxml, :libxml_sax, :rexml]
9
+ CLASS_MAPPINGS = {:xmlparser => 'XMLParser', :libxml => 'LibXML', :libxml_sax => 'libXMLSax', :rexml => 'REXML' }
10
+ WARN = {:rexml => "Using REXML as parser! This is very slow on large docs!\nCall the method AXML::Autoload.install_instructions for help installing\nsomething FASTER!",
11
+ }
12
+
13
+ # opts:
14
+ # :parser =
15
+ def parse(arg, opts={})
16
+ opts = DEFAULTS.merge opts
17
+ parser = AXML::Autoload.parser!(opts[:parser])
18
+ method =
19
+ if arg.is_a?(String) && File.exist?(arg)
20
+ :parse_file
21
+ elsif arg.is_a?(IO)
22
+ :parse_io
23
+ elsif arg.is_a?(String)
24
+ :parse_string
224
25
  else
225
- return child_node.find_first_descendant(name)
26
+ raise ArgumentError, "can deal with filenames, Strings, and IO objects.\nDon't know how to work with object of class: #{arg.class}"
226
27
  end
227
- end
228
- return nil
28
+ parser.send(method, arg, opts)
229
29
  end
230
30
 
231
- def find_children(name)
232
- children.select {|v| v.name == name }
31
+ def parse_file(file, opts={}) # :nodoc:
32
+ opts = DEFAULTS.merge opts
33
+ parser = AXML::Autoload.parser!(opts[:parser])
34
+ File.open(file) {|fh| parser.parse_io(fh, opts) }
233
35
  end
234
36
 
235
- def find_first_child(name)
236
- self.each do |child_node|
237
- if child_node.name == name
238
- return child_node
239
- end
240
- end
241
- return nil
242
- end
243
-
244
- def find_following_siblings(name)
245
- parent.children[(array_index+1)..-1].select {|v| v.name == name }
246
- end
247
-
248
- def find_first_following_sibling(name)
249
- node = nil
250
- parent.children[(array_index+1)..-1].each do |sibling|
251
- if sibling.name == name
252
- node = sibling
253
- break
254
- end
255
- end
256
- node
257
- end
37
+ extend AXML
258
38
 
259
39
  end
260
-
261
- class AXML::XMLParser < XMLParser
262
-
263
- attr_writer :root
264
-
265
- # returns the first node found in the document
266
- def root
267
- @root.child
268
- end
269
-
270
- def set_no_keep_blanks
271
- instance_eval do
272
- def endElement(name)
273
- unless AXML::NotBlankText_re.match(@cur.text)
274
- @cur.text = nil
275
- end
276
- @cur = @cur.parent
277
- end
278
- end
279
- end
280
-
281
- # returns text as an array for each occurence of the specified element: [start_index, num_bytes]
282
- def set_single_text_indices(el_name)
283
- @el_name = el_name
284
- instance_eval do
285
- def startElement(name, attributes)
286
- text =
287
- if name == @el_name ; []
288
- else ; ''
289
- end
290
- new_el = ::AXML::El.new(@cur, name, attributes, text, [])
291
- @cur.add_node(new_el)
292
- @cur = new_el
293
- end
294
-
295
- def character(data)
296
- if @cur.text.is_a? Array
297
- @cur.text << byteIndex
298
- else
299
- @cur.text << data
300
- end
301
- end
302
-
303
- def endElement(name)
304
- if @cur.text.is_a? Array
305
- @cur.text << (byteIndex - @cur.text.first)
306
- end
307
- @cur = @cur.parent
308
- end
309
- end
310
- end
311
-
312
- # takes opts from AXML::parse method
313
- def initialize
314
- #@keep_blanks = opts[:keep_blanks]
315
- @root = AXML::El.new(nil, "root", {}, '', [])
316
- @cur = @root
317
- end
318
-
319
- def startElement(name, attributes)
320
- new_el = AXML::El.new(@cur, name, attributes, '', [])
321
- @cur.add_node(new_el)
322
- @cur = new_el
323
- end
324
-
325
- def character(data)
326
- @cur.text << data
327
- end
328
-
329
- def endElement(name)
330
- @cur = @cur.parent
331
- end
332
-
333
-
334
- end
335
-
336
-
337
-
338
- =begin
339
-
340
- # This parser stores information about where the peaks information is in the
341
- # file
342
- # The content of the peaks node is an array where the first member is the
343
- # start index and the last member is the number of bytes. All other members
344
- # should be ignored.
345
- class AXML::XMLParser::LazyPeaks < ::AXML::XMLParser
346
-
347
- def startElement(name, attributes)
348
- text =
349
- if name == 'peaks' ; []
350
- else ; ''
351
- end
352
- new_el = ::AXML::El.new(@cur, name, attributes, text, [])
353
- # add the new node to the previous parent node
354
- @cur.add_node(new_el)
355
- # notice the change in @cur node
356
- @cur = new_el
357
- end
358
-
359
- def character(data)
360
- if @cur.text.is_a? Array
361
- @cur.text << byteIndex
362
- else
363
- @cur.text << data
364
- end
365
- end
366
-
367
- def endElement(name)
368
- if @cur.text.is_a? Array
369
- @cur.text << (byteIndex - @cur.text.first)
370
- end
371
- @cur = @cur.parent
372
- end
373
-
374
- end
375
-
376
- =end