axml 0.0.3 → 0.0.4

Sign up to get free protection for your applications and to get access to all the features.
data/README CHANGED
@@ -1,30 +1,27 @@
1
1
  AXML
2
2
  ====
3
3
 
4
- AXML - Provides a simple DOM for working with XML (using XMLParser under the
5
- hood) that can serve as a drop in replacement for a subset of basic libxml
6
- functionality (e.g., each, children, child, find_first, find, next).
4
+ AXML - Provides a simple, minimalistic DOM for working with data stored in an
5
+ XML document. The API is very similar to LibXML, differing slightly in the
6
+ handling of text nodes. It is designed with very large documents in mind: nodes are represented in memory-efficient Struct objects, and it works with either XMLParser or LibXML!
7
7
 
8
- 'AXML' means 'ax XML' which succinctly describes the occasional feeling of a
9
- programmer towards XML or its myriad parsers. AXML won't solve all your
10
- problems, but it does make working with XML much less painful.
8
+ 'AXML' literally translates into 'ax XML' which succinctly describes the
9
+ occasional feeling of a programmer towards XML or its myriad parsers. AXML
10
+ won't solve all your XML woes, but it does make working with XML much less
11
+ painful.
11
12
 
12
13
  Features
13
14
  --------
14
15
 
15
- * *fast*: it's implemented in XMLParser (expat under the hood)
16
- * *lean*: as in 'lines of code' (~220 w/ blank lines) and as in 'memory consumption' (nodes implemented as Struct, children in Array)
16
+ * *fast*: runs on either XMLParser or LibXML
17
+ * *lean*: as in 'lines of code' and as in 'memory consumption' (nodes implemented as Struct, children in Array)
17
18
  * *easy to extend*: code your Grandmother could read and understand (if she reads ruby)
18
- * *quacks like libxml*: implements a very useful subset of libxml methods for near drop in replacement.
19
-
19
+ * *PLOS*: implements a useful subset of libxml methods for a near drop-in replacement.
20
20
 
21
21
  Examples
22
22
  --------
23
23
 
24
- require 'axml' # currently requires 'xmlparser' be installed
25
- # Windows: already in one-click-installer
26
- # Ubuntu: sudo apt-get install libxml-parser-ruby1.8
27
- # Cygwin: see http://mspire.rubyforge.org/tutorial/cygwin_mspire.html
24
+ require 'axml'
28
25
 
29
26
  # a little example xml string to use
30
27
  string_or_io = "
@@ -39,18 +36,24 @@ Examples
39
36
  </n1>
40
37
  "
41
38
 
42
- ### Read a string or io
39
+ ### Read a string, io, or file
43
40
 
44
41
  n1_node = AXML.parse(string_or_io)
45
-
46
- ### Read a file
47
-
48
- n1_node = AXML.parse_file('path/to/file')
42
+ # --or--
43
+ n1_node = AXML.parse('path/to/file')
49
44
 
50
45
  ### Access children
51
46
 
52
47
  n1_node.children # -> [array]
53
- n1_node.each {|child| # do something with child }
48
+ n1_node.each {|child| # do something with each child }
49
+
50
+ ### Traverse the whole tree structure
51
+
52
+ n1_node.traverse do |node|
53
+ # pre traversal
54
+ end
55
+
56
+ n1_node.traverse(:post) {|node| # post traversal }
54
57
 
55
58
  ### Get attributes and text
56
59
 
@@ -59,7 +62,7 @@ Examples
59
62
  n3_node.text # -> 'words here'
60
63
  n3_node.content # -> [same]
61
64
 
62
- ### Traverse nodes with next and child
65
+ ### Navigate nodes
63
66
 
64
67
  n2_node = n1_node.child
65
68
  the_other_n2_node = n2_node.next
@@ -71,26 +74,12 @@ Examples
71
74
  n3_node = n1_node.find_first('descendant::n3')
72
75
  other_n3_node = n3_node.find_first('following-sibling::n3')
73
76
  n1_node.find_first('child::n3') # -> nil
77
+ # also callable as find_first_child and find_first_descendant
74
78
 
75
79
  # find (returns an array)
76
- n1_node.find('descendant::n3') # -> [array of all 3 <n3> nodes]
77
80
  n1_node.find('child::n2') # -> [array of 2 <n2> nodes]
78
-
79
- ### Switch to libxml
80
-
81
- This is all it takes to get all of the above code to work under libxml:
82
-
83
- require 'xml/libxml' # instead of: require 'axml'
84
-
85
- # A file
86
- REPLACE: n1_node = AXML.parse_file(file)
87
- WITH: n1_node = XML::Document.file(file).root # note the .root call on the end!
88
-
89
- # A string
90
- REPLACE: n1_node = AXML.parse(string)
91
- WITH: n1_node = XML::Parser.string(string).parse.root # note the .root call on the end!
92
-
93
- Wallah! All the above method calls work under libxml
81
+ n1_node.find('descendant::n3') # -> [array of all 3 <n3> nodes]
82
+ # also callable as find_child and find_descendant
94
83
 
95
84
 
96
85
  See `specs/axml_spec.rb` for more examples and functionality
@@ -107,3 +96,10 @@ Installation
107
96
  ------------
108
97
 
109
98
  gem install axml
99
+
100
+ See Also
101
+ --------
102
+
103
+ If you are parsing HTML or complex word-processing documents, this is not the parser for you. Try something like hpricot or LibXML.
104
+
105
+
data/Rakefile CHANGED
@@ -2,9 +2,9 @@ require 'rake'
2
2
  require 'rubygems'
3
3
  require 'rake/rdoctask'
4
4
  require 'rake/gempackagetask'
5
+ require 'rake/testtask'
5
6
  require 'rake/clean'
6
7
  require 'fileutils'
7
- #require 'spec/rake/spectask'
8
8
  require 'email_encrypt'
9
9
 
10
10
  ###############################################
@@ -59,7 +59,7 @@ task :html_docs do
59
59
 
60
60
  # add contact info:
61
61
  index.puts '<h2>Contact</h2>'
62
- index.puts 'jprince@icmb.utexas.edu'.email_encrypt
62
+ index.puts 'jtprince@gmail.com'.email_encrypt
63
63
 
64
64
  index.puts '</body></html>'
65
65
  end
@@ -75,64 +75,20 @@ end
75
75
  # TESTS
76
76
  ###############################################
77
77
 
78
-
79
- task :ensure_gem_is_uninstalled do
80
- reply = `#{$gemcmd} list -l #{NAME}`
81
- if reply.include? NAME + " ("
82
- puts "GOING to uninstall gem '#{NAME}' for testing"
83
- if WIN32
84
- %x( #{$gemcmd} uninstall -x #{NAME} )
85
- else
86
- %x( sudo #{$gemcmd} uninstall -x #{NAME} )
87
- end
88
- end
78
+ desc 'Default: Run specs.'
79
+ task :default => :spec
80
+
81
+ desc 'Run specs.'
82
+ Rake::TestTask.new(:spec) do |t|
83
+ t.verbose = true
84
+ t.warning = true
85
+ ENV['RUBYOPT'] = 'rubygems'
86
+ ENV['TEST'] = ENV['SPEC'] if ENV['SPEC']
87
+ t.libs = ['lib']
88
+ t.test_files = Dir.glob( File.join('spec', ENV['pattern'] || '**/*_spec.rb') )
89
+ t.options = "-v"
89
90
  end
90
91
 
91
- #namespace :spec do
92
- # task :autotest do
93
- # require './specs/rspec_autotest'
94
- # RspecAutotest.run
95
- # end
96
- #end
97
-
98
- #desc "Run specs"
99
- #Spec::Rake::SpecTask.new('spec') do |t|
100
- # Rake::Task[:ensure_gem_is_uninstalled].invoke
101
- # t.libs = ['lib']
102
- # t.spec_files = FileList['specs/**/*_spec.rb']
103
- #end
104
-
105
- #desc "Run specs and output specdoc"
106
- #Spec::Rake::SpecTask.new('specl') do |t|
107
- # Rake::Task[:ensure_gem_is_uninstalled].invoke
108
- # t.spec_files = FileList['specs/**/*_spec.rb']
109
- # t.libs = ['lib']
110
- # t.spec_opts = ['--format', 'specdoc' ]
111
- #end
112
-
113
- #desc "Run all specs with RCov"
114
- #Spec::Rake::SpecTask.new('rcov') do |t|
115
- # Rake::Task[:ensure_gem_is_uninstalled].invoke
116
- # t.spec_files = FileList['specs/**/*_spec.rb']
117
- # t.rcov = true
118
- # t.libs = ['lib']
119
- # t.rcov_opts = ['--exclude', 'specs']
120
- #end
121
-
122
- #task :spec do
123
- # uninstall_gem
124
- # # files that match a key word
125
- # files_to_run = ENV['SPEC'] || FileList['specs/**/*_spec.rb']
126
- # if ENV['SPECM']
127
- # files_to_run = files_to_run.select do |file|
128
- # file.include?(ENV['SPECM'])
129
- # end
130
- # end
131
- # files_to_run.each do |spc|
132
- # system "ruby -I lib -S spec #{spc} --format specdoc"
133
- # end
134
- #end
135
-
136
92
  ###############################################
137
93
  # PACKAGE / INSTALL / UNINSTALL
138
94
  ###############################################
@@ -191,9 +147,11 @@ gemspec = Gem::Specification.new do |t|
191
147
  t.platform = Gem::Platform::RUBY
192
148
  t.name = NAME
193
149
  t.version = IO.readlines(changelog).grep(/##.*version/).pop.split(/\s+/).last.chomp
150
+ t.homepage = 'http://axml.rubyforge.org/'
151
+ t.rubyforge_project = 'axml'
194
152
  t.summary = summary
195
153
  t.date = "#{tm.year}-#{tm.month}-#{tm.day}"
196
- t.email = "jprince@icmb.utexas.edu"
154
+ t.email = "jtprince@gmail.com"
197
155
  t.description = description
198
156
  t.has_rdoc = true
199
157
  t.authors = ["John Prince"]
@@ -201,8 +159,8 @@ gemspec = Gem::Specification.new do |t|
201
159
  t.rdoc_options = rdoc_options
202
160
  t.extra_rdoc_files = rdoc_extra_includes
203
161
  t.executables = FL["bin/*"].map {|file| File.basename(file) }
204
- t.requirements << 'xmlparser is needed right now'
205
- t.test_files = FL["specs/*_spec.rb"]
162
+ t.requirements << 'xmlparser or libxml'
163
+ t.test_files = FL["spec/**/*_spec.rb"]
206
164
  end
207
165
 
208
166
  desc "Create packages."
data/lib/axml.rb CHANGED
@@ -1,376 +1,39 @@
1
- require 'xmlparser'
2
1
 
3
- class AXML
4
-
5
- NotBlankText_re = /[^\s+]+/m
6
-
7
- def self.parse_file(file)
8
- root = nil
9
- File.open(file) {|fh| root = parse(fh) }
10
- root
11
- end
12
-
13
- # Returns the root node (as Element) or nodes (as Array)
14
- # options:
15
- # :keep_blanks => *true | false
16
- def self.parse(stream, opts={:keep_blanks => false})
17
- parser = AXML::XMLParser.new
18
- if opts[:keep_blanks] == false
19
- parser.set_no_keep_blanks
20
- end
21
- if ti = opts[:text_indices]
22
- if ti.is_a?(Array) && ti.size > 1
23
- raise NotImplementedError, "currently only supports a single element"
24
- else
25
- ti =
26
- if ti.is_a?(Array)
27
- ti.first.to_s
28
- else
29
- ti.to_s
30
- end
31
- parser.set_single_text_indices(ti)
32
- end
33
- end
34
- parser.parse(stream)
35
- parser.root
36
- end
37
-
38
- end
39
-
40
- AXML::El = Struct.new(:parent, :name, :attrs, :text, :children, :array_index)
41
-
42
- class AXML::El
43
- include Enumerable
44
-
45
- # use AXML::El::Indent.replace to swap without warning
46
- # ["", " ", " ", " ", " ", " ", ... ]
47
- Indent = ' '
48
- # use AXML::El::Indentation.replace to replace w/o warning
49
- Indentation = (0...30).to_a.map {|num| Indent*num }
50
-
51
- # current depth
52
- @@depth = 0
53
-
54
- alias_method :content, :text
55
- alias_method :content=, :text=
56
- alias_method :kids, :children
57
- alias_method :kids=, :children=
58
-
59
- def [](attribute_string)
60
- attrs[attribute_string]
61
- end
62
-
63
- def []=(attribute_string, value)
64
- attrs[attribute_string] = value
65
- end
66
-
67
- # has text?
68
- def text?
69
- !!text
70
- end
71
-
72
- def children?
73
- children.size > 0
74
- end
75
- alias_method :child?, :children?
76
-
77
- # full traversal from the initial node
78
- def traverse(type=:pre, &block)
79
- if type == :pre
80
- block.call(self)
81
- end
82
- children.each do |child|
83
- child.traverse(type, &block)
84
- end
85
- if type == :post
86
- block.call(self)
87
- end
88
- end
89
-
90
- def each(&block)
91
- children.each do |child|
92
- block.call(child)
93
- end
94
- end
95
-
96
- # drops the current element from the list of its parents children
97
- def drop
98
- parent.children.delete(self)
99
- end
100
-
101
- def drop_child(node)
102
- found_it = false
103
- found_index = nil
104
- children.each_with_index do |v,i|
105
- if found_it
106
- v.array_index = i - 1
107
- end
108
- if v.object_id == node.object_id
109
- found_index = i
110
- found_it = true
111
- end
112
- end
113
- children.delete_at(found_index) if found_index
114
- end
115
-
116
- def tabs
117
- Indentation[@@depth]
118
- end
119
-
120
- EscapeCharsRe = /['"&><]/
121
-
122
- # returns data escaped if necessary
123
- def escape(data)
124
- # modified slightly from xmlsimple.rb
125
- return data if !data.is_a?(String) || data.nil? || data == ''
126
- result = data.dup
127
- if EscapeCharsRe.match(data)
128
- result.gsub!('&', '&amp;')
129
- result.gsub!('<', '&lt;')
130
- result.gsub!('>', '&gt;')
131
- result.gsub!('"', '&quot;')
132
- result.gsub!("'", '&apos;')
133
- end
134
- result
135
- end
136
-
137
- def to_s
138
- attstring = ""
139
- if attrs.size > 0
140
- attstring = " " + attrs.collect { |k,v| "#{k}=\"#{escape(v)}\"" }.join(" ")
141
- end
142
- string = "#{tabs}<#{name}#{attstring}"
143
- if children.size > 0
144
- string << ">"
145
- if text?
146
- string << escape(text)
147
- end
148
- string << "\n"
149
- @@depth += 1
150
- string << children.collect {|child| child.to_s }.join("")
151
- @@depth -= 1
152
- string << "#{tabs}</#{name}>\n"
153
- elsif text?
154
- string << ">" << escape(text) << "</#{name}>\n"
155
- else
156
- string << "/>\n"
157
- end
158
- string
159
- end
160
-
161
- def inspect
162
- "<name='#{name}' attrs='#{attrs.inspect}' children.size=#{children.size}>"
163
- end
164
-
165
- # the next node
166
- def next
167
- parent.children[array_index+1]
168
- end
169
-
170
- # the first child (equivalent to children.first)
171
- def child
172
- children.first
173
- end
174
-
175
- def add_node(node)
176
- node.array_index = children.size
177
- children.push( node )
178
- end
179
-
180
- ########################################################################
181
- # FIND and FIND_FIRST (with a little useful xpath)
182
- ########################################################################
183
-
184
- # Returns an array of nodes. Accepts same xpath strings as find_first.
185
- def find(string)
186
- (tp, name) = string.split('::')
187
- case tp
188
- when 'child'
189
- find_children(name)
190
- when 'descendant'
191
- find_descendants(name)
192
- when 'following-sibling'
193
- find_following_siblings(name)
194
- end
195
- end
196
-
197
- # currently must be called with descendant:: or child:: string prefix! e.g.
198
- # "descendant::<name>" and "child::<name>" where <name> is the name of the
199
- # node you seek)
200
- def find_first(string)
201
- (tp, name) = string.split('::')
202
- case tp
203
- when 'child'
204
- find_first_child(name)
205
- when 'descendant'
206
- find_first_descendant(name)
207
- when 'following-sibling'
208
- find_first_following_sibling(name)
209
- end
210
- end
211
-
212
- def find_descendants(name, collect_descendants=[])
213
- children.each do |child|
214
- collect_descendants.push(child) if child.name == name
215
- child.find_descendants(name, collect_descendants)
216
- end
217
- collect_descendants
218
- end
219
-
220
- def find_first_descendant(name)
221
- self.each do |child_node|
222
- if child_node.name == name
223
- return child_node
2
+ require 'axml/autoload'
3
+
4
+ module AXML
5
+ # note that if Autoload must find a suitable parser, it will be set as the
6
+ # :parser default to be used for future reference.
7
+ DEFAULTS = {:keep_blanks => false, :parser => nil}
8
+ PREFERRED = [:xmlparser, :libxml, :libxml_sax, :rexml]
9
+ CLASS_MAPPINGS = {:xmlparser => 'XMLParser', :libxml => 'LibXML', :libxml_sax => 'libXMLSax', :rexml => 'REXML' }
10
+ WARN = {:rexml => "Using REXML as parser! This is very slow on large docs!\nCall the method AXML::Autoload.install_instructions for help installing\nsomething FASTER!",
11
+ }
12
+
13
+ # opts:
14
+ # :parser =
15
+ def parse(arg, opts={})
16
+ opts = DEFAULTS.merge opts
17
+ parser = AXML::Autoload.parser!(opts[:parser])
18
+ method =
19
+ if arg.is_a?(String) && File.exist?(arg)
20
+ :parse_file
21
+ elsif arg.is_a?(IO)
22
+ :parse_io
23
+ elsif arg.is_a?(String)
24
+ :parse_string
224
25
  else
225
- return child_node.find_first_descendant(name)
26
+ raise ArgumentError, "can deal with filenames, Strings, and IO objects.\nDon't know how to work with object of class: #{arg.class}"
226
27
  end
227
- end
228
- return nil
28
+ parser.send(method, arg, opts)
229
29
  end
230
30
 
231
- def find_children(name)
232
- children.select {|v| v.name == name }
31
+ def parse_file(file, opts={}) # :nodoc:
32
+ opts = DEFAULTS.merge opts
33
+ parser = AXML::Autoload.parser!(opts[:parser])
34
+ File.open(file) {|fh| parser.parse_io(fh, opts) }
233
35
  end
234
36
 
235
- def find_first_child(name)
236
- self.each do |child_node|
237
- if child_node.name == name
238
- return child_node
239
- end
240
- end
241
- return nil
242
- end
243
-
244
- def find_following_siblings(name)
245
- parent.children[(array_index+1)..-1].select {|v| v.name == name }
246
- end
247
-
248
- def find_first_following_sibling(name)
249
- node = nil
250
- parent.children[(array_index+1)..-1].each do |sibling|
251
- if sibling.name == name
252
- node = sibling
253
- break
254
- end
255
- end
256
- node
257
- end
37
+ extend AXML
258
38
 
259
39
  end
260
-
261
- class AXML::XMLParser < XMLParser
262
-
263
- attr_writer :root
264
-
265
- # returns the first node found in the document
266
- def root
267
- @root.child
268
- end
269
-
270
- def set_no_keep_blanks
271
- instance_eval do
272
- def endElement(name)
273
- unless AXML::NotBlankText_re.match(@cur.text)
274
- @cur.text = nil
275
- end
276
- @cur = @cur.parent
277
- end
278
- end
279
- end
280
-
281
- # returns text as an array for each occurence of the specified element: [start_index, num_bytes]
282
- def set_single_text_indices(el_name)
283
- @el_name = el_name
284
- instance_eval do
285
- def startElement(name, attributes)
286
- text =
287
- if name == @el_name ; []
288
- else ; ''
289
- end
290
- new_el = ::AXML::El.new(@cur, name, attributes, text, [])
291
- @cur.add_node(new_el)
292
- @cur = new_el
293
- end
294
-
295
- def character(data)
296
- if @cur.text.is_a? Array
297
- @cur.text << byteIndex
298
- else
299
- @cur.text << data
300
- end
301
- end
302
-
303
- def endElement(name)
304
- if @cur.text.is_a? Array
305
- @cur.text << (byteIndex - @cur.text.first)
306
- end
307
- @cur = @cur.parent
308
- end
309
- end
310
- end
311
-
312
- # takes opts from AXML::parse method
313
- def initialize
314
- #@keep_blanks = opts[:keep_blanks]
315
- @root = AXML::El.new(nil, "root", {}, '', [])
316
- @cur = @root
317
- end
318
-
319
- def startElement(name, attributes)
320
- new_el = AXML::El.new(@cur, name, attributes, '', [])
321
- @cur.add_node(new_el)
322
- @cur = new_el
323
- end
324
-
325
- def character(data)
326
- @cur.text << data
327
- end
328
-
329
- def endElement(name)
330
- @cur = @cur.parent
331
- end
332
-
333
-
334
- end
335
-
336
-
337
-
338
- =begin
339
-
340
- # This parser stores information about where the peaks information is in the
341
- # file
342
- # The content of the peaks node is an array where the first member is the
343
- # start index and the last member is the number of bytes. All other members
344
- # should be ignored.
345
- class AXML::XMLParser::LazyPeaks < ::AXML::XMLParser
346
-
347
- def startElement(name, attributes)
348
- text =
349
- if name == 'peaks' ; []
350
- else ; ''
351
- end
352
- new_el = ::AXML::El.new(@cur, name, attributes, text, [])
353
- # add the new node to the previous parent node
354
- @cur.add_node(new_el)
355
- # notice the change in @cur node
356
- @cur = new_el
357
- end
358
-
359
- def character(data)
360
- if @cur.text.is_a? Array
361
- @cur.text << byteIndex
362
- else
363
- @cur.text << data
364
- end
365
- end
366
-
367
- def endElement(name)
368
- if @cur.text.is_a? Array
369
- @cur.text << (byteIndex - @cur.text.first)
370
- end
371
- @cur = @cur.parent
372
- end
373
-
374
- end
375
-
376
- =end