uformatparser 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (5) hide show
  1. data/MIT-LICENSE +20 -0
  2. data/README +44 -0
  3. data/Rakefile +74 -0
  4. data/lib/uformatparser.rb +731 -0
  5. metadata +50 -0
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2005 Assaf Arkin
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README ADDED
@@ -0,0 +1,44 @@
1
+ = Microformat Parser
2
+
3
+ MicroformatParser is a Ruby module for creating microformat parsers.
4
+ A microformat parser is a class with a set of rules for extracting
5
+ interesting content from (X)HTML documents. You create your own parser
6
+ by writing a class with a set of rules. The magic happens in the parse
7
+ method which takes an (X)HTML document or element, runs all the rules
8
+ on it, and returns a new object that holds the extracted values.
9
+
10
+ Here's a simple example to find all links and all tags in a document:
11
+
12
+ class MyParser
13
+ include MicroformatParser
14
+
15
+ rule :links, "a", "a@href"
16
+ rule :tags, "a[rel~=tag]", "text()"
17
+ end
18
+
19
+ content = MyParser.parse(doc)
20
+ puts "Found " + content.links.size.to_s + " links" if content.links
21
+ puts "Tagged with " + content.tags.join(', ') if content.tags
22
+
23
+
24
+ == Documentation
25
+
26
+ You may want to read the documentation for a more detailed discussion of
27
+ selectors, extractors, compound rules, (X)HTML parsing and examples
28
+
29
+ http://trac.labnotes.org/cgi-bin/trac.cgi/wiki/Ruby/MicroformatParser
30
+
31
+
32
+ == Download
33
+
34
+ The latest version can be found at
35
+
36
+ http://rubyforge.org/projects/uformatparser/
37
+
38
+ == License
39
+
40
+ This package is licensed under the MIT license and/or the {Creative
41
+ Commons Attribution-ShareAlike}[http://creativecommons.org/licenses/by-sa/2.5/legalcode].
42
+
43
+ :include: MIT-LICENSE
44
+
@@ -0,0 +1,74 @@
1
# Adapted from the rake Rakefile.
#
# Build script for the uformatparser gem: runs the tests, generates the RDoc
# documentation and builds the gem/tar/zip packages.

require 'rubygems'
# Gem.manage_gems only exists in very old RubyGems releases; call it only
# where it is still available.
Gem.manage_gems if Gem.respond_to?(:manage_gems)
require 'rake/testtask'

# Prefer the task classes that ship with RDoc and RubyGems, and fall back to
# the ones that were bundled with old versions of Rake.
begin
  require 'rdoc/task'
  rdoc_task_class = RDoc::Task
rescue LoadError
  require 'rake/rdoctask'
  rdoc_task_class = Rake::RDocTask
end

begin
  require 'rubygems/package_task'
  gem_task_class = Gem::PackageTask
rescue LoadError
  require 'rake/gempackagetask'
  gem_task_class = Rake::GemPackageTask
end


desc "Default Task"
task :default => [:tests, :rdoc]


Rake::TestTask.new :tests do |test|
  test.verbose = true
  test.test_files = ['test/*.rb']
end


# Create the documentation.
rdoc_task_class.new do |rdoc|
  rdoc.main = "README"
  rdoc.rdoc_files.include("README", "lib/**/*.rb")
  rdoc.title = 'Microformat Parser'
end


# Create the GEM package.
gem_spec = Gem::Specification.new do |spec|
  spec.name = 'uformatparser'
  spec.version = "1.0.0"
  spec.summary = "Microformat parser for extracting microcontent from (X)HTML"
  spec.description = <<-EOF
    Parser for extracting microcontent from (X)HTML documents, in any number
    of microformats.

    Uses a DSL for specifying the parsing rules as a set of selectors and
    extractors. Supports a CSS-like selector and XPath-like extractor syntaxes
    for quick and easy rule writing. Also supports reusable and compound rules.
  EOF
  spec.author = "Assaf Arkin"
  spec.email = "assaf@labnotes.org"
  spec.homepage = "http://trac.labnotes.org/cgi-bin/trac.cgi/wiki/Ruby/MicroformatParser"

  spec.files = FileList["{test,lib}/**/*", "README", "Rakefile", "MIT-LICENSE"].to_a
  spec.require_path = "lib"
  spec.requirements << "ReXML. HTML->ReXML parser."
  spec.rdoc_options << '--main' << 'README' << '--title' << 'Microformat parser' << '--line-numbers'
  spec.extra_rdoc_files = ["README"]
  # These attributes are deprecated and may be missing from newer RubyGems
  # releases; set them only where the specification still supports them.
  spec.autorequire = 'uformatparser.rb' if spec.respond_to?(:autorequire=)
  spec.has_rdoc = true if spec.respond_to?(:has_rdoc=)
  spec.rubyforge_project = "uformatparser" if spec.respond_to?(:rubyforge_project=)
end

gem_task_class.new(gem_spec) do |pkg|
  pkg.need_tar = true
  pkg.need_zip = true
end


# --------------------------------------------------------------------
# Creating a release

desc "Make a new release"
task :release => [:tests, :clobber, :package] do
  puts
  puts "**************************************************************"
  puts "* Release #{gem_spec.version} Complete."
  puts "* Packages ready to upload."
  puts "**************************************************************"
  puts
end
73
+
74
+
@@ -0,0 +1,731 @@
1
+ #
2
+ # = uformatparser.rb - Microformat parser
3
+ #
4
+ #--
5
+ # Author:: Assaf Arkin assaf@labnotes.org
6
+ # Documentation:: http://trac.labnotes.org/cgi-bin/trac.cgi/wiki/Ruby/MicroformatParser
7
+ # Copyright:: Copyright (c) 2005 Assaf Arkin
8
+ # License:: Creative Commons Attribution-ShareAlike
9
+ #
10
+ #++
11
+
12
require 'rexml/document'
require 'time'
13
+
14
+
15
+ # Implements a microformat parser by extending a class that includes this module.
16
+ #
17
+ # === The Basics
18
+ #
19
+ # To create a microformat parser, extend a class with this module and use the
20
+ # +rule+ method to define parsing rules for that class. Call +parse+ to parse the
21
+ # content, returning a new instance of the class holding all values extracted from
22
+ # parsing. You can parse a document or an element.
23
+ #
24
+ # For example:
25
+ # class Microformats
26
+ # include MicroformatParser
27
+ #
28
+ # class HCalendar
29
+ # include MicroformatParser
30
+ #
31
+ # # Extract ISO date/time
32
+ # extractor :dt_extractor do |node|
33
+ # value = node.attributes['title'] if node.name == 'abbr'
34
+ # value = text(node) unless value
35
+ # value ? Time.parse(value) : nil
36
+ # end
37
+ #
38
+ # rule_1 :dtstart, nil, :dt_extractor
39
+ # rule_1 :dtend, nil, :dt_extractor
40
+ # rule_1 :summary, nil, :text
41
+ # rule_1 :description, nil, :xml
42
+ # rule_1 :url, nil, "a@href"
43
+ # end
44
+ #
45
+ # rule :tags, "a[rel~=tag]", "text()"
46
+ # rule :events, ".vevent", HCalendar
47
+ # end
48
+ #
49
+ # content = Microformats.parse(doc)
50
+ # puts content.tags
51
+ # puts content.events
52
+ #
53
+ module MicroformatParser
54
+
55
+
56
+ # Create a new rule.
57
+ #
58
+ # There are two ways to define a rule:
59
+ # * rule name, selector?, extractor?, limit?
60
+ # * rule name, limit? { block }
61
+ #
62
+ # The +name+ argument specifies an instance variable that holds the value
63
+ # (or values) extracted from processing this rule. It can be a string or
64
+ # a symbol. An attribute accessor is created with that name.
65
+ #
66
+ # The +selector+ argument identifies all nodes that match the rule. It can
67
+ # be an CSS-style selector (string) or a method/proc. A symbol specifies
68
+ # a method to use from this class. The method/proc receives a single argument
69
+ # with the node and must return true/false.
70
+ #
71
+ # If selector is absent, the default selector will match any element with
72
+ # a class of the same name as the name argument. For example:
73
+ # rule :dtstart
74
+ # Matches all elements with the class _dtstart_.
75
+ #
76
+ # The +extractor+ argument specifies how to extract a value from a selected
77
+ # node. It can be a list of extract rules (string), a method/proc, or a class.
78
+ # A symbol specifies a method to use from this class. The method/proc receives
79
+ # a single argument with the node and returns the extracted value, or nil.
80
+ #
81
+ # If the extractor is a class, it references a microformat parser which is
82
+ # then called to parse the content of a matching element.
83
+ #
84
+ # If extractor is absent, the default extractor is used:
85
+ # abbr@title|a@href|text()
86
+ #
87
+ # The +limit+ argument specifies the cardinality of the rule's value:
88
+ # 0 The rule is never applied
89
+ # 1 The rule is applied once, the first extracted value is set
90
+ # -1 The rule is applied multiple times, extracted values are set in an array
91
+ # n The rule is applied up to _n_ times, extracted values are set in an array
92
+ #
93
+ # In the second form, a block is specified instead of the selector/extractor.
94
+ # The block is called with a node and returns the extracted value, or nil.
95
# Defines a parsing rule and an attribute accessor named +name+.
#
# name::      Rule name (string or symbol); also the accessor/instance variable.
# selector::  nil (match elements whose class contains +name+), a CSS-style
#             string, a proc/method, or a symbol naming a method of this class.
# extractor:: nil (default extractor), an extractor expression (string), a
#             proc/method, a symbol naming a method of this class, or a class
#             that responds to +parse+.
# limit::     Cardinality: 0 disabled, 1 single value, n up to n values,
#             -1 unbounded.
# block::     Alternative to selector/extractor; receives the node and returns
#             the extracted value or nil.
#
# Raises InvalidRuleException (or one of its subclasses) for invalid arguments.
# Returns the class's rule list with the new rule appended.
def rule(name, selector = nil, extractor = nil, limit = -1, &block)
  raise InvalidRuleException, "First argument (rule name) is required" unless name
  if block
    # The rule processing is taken from the block, everything else must be nil
    raise InvalidRuleException, "Can't specify selector/extractor in combination with proc" if selector or extractor
    rule = Rule.new(name, nil, block, limit)
  else
    # Determine the selector.
    selector = case selector
    when NilClass
      # Absent selector: match elements with the same class as the rule name.
      # The name is escaped so it is always matched literally.
      match = Regexp.new("\\b#{Regexp.escape(name.to_s)}\\b")
      lambda { |node| node.attributes['class'] =~ match }
    when String
      # CSS-style selector
      Selector.create(selector)
    when Proc, Method
      # Use as is
      selector
    when Symbol
      # Look up the named method on this class. Kernel#method raises
      # NameError for an unknown name, so translate that into our own error.
      begin
        method(selector)
      rescue NameError => error
        raise InvalidSelectorException, "Method #{selector} is not a valid selector", error.backtrace
      end
    else
      raise InvalidSelectorException, "Invalid selector type: must be a string, symbol, proc/method or nil"
    end

    # Determine the extractor
    extractor = case extractor
    when NilClass
      # Absent extractor: use the default extractor
      default_extractor
    when String
      # Extractor expression
      Extractor.new(self, extractor)
    when Proc, Method
      # Use as is
      extractor
    when Symbol
      # Look up the named method on this class (see selector above).
      begin
        method(extractor)
      rescue NameError => error
        raise InvalidExtractorException, "Method #{extractor} is not a valid extractor", error.backtrace
      end
    when Class
      # Extractor is a class, generally another ruleset, so we call
      # its parse method (must exist).
      begin
        extractor.method(:parse)
      rescue NameError => error
        raise InvalidExtractorException, "Extractor class must implement the method parse", error.backtrace
      end
      extractor
    else
      raise InvalidExtractorException, "Invalid extractor type: must be a string, parser class, block or nil"
    end

    # Create a new rule, to invoke its process method
    rule = Rule.new(name, selector, extractor, limit)
  end

  # Create a read/write accessor with the same name as the rule; it holds
  # the rule's value on parsed instances.
  attr_accessor name
  # Add this rule to the class's ruleset
  self.rules << rule
end
165
+
166
+
167
# Defines a rule that keeps at most one value.
#
# Shorthand for +rule+ with a +limit+ of 1; all other arguments (and the
# optional block) are passed through unchanged.
def rule_1(name, selector = nil, extractor = nil, &block)
  single_value = 1
  rule(name, selector, extractor, single_value, &block)
end
174
+
175
+
176
+ # Creates a new selector.
177
+ #
178
+ # There are two ways to create a selector:
179
+ # * selector name, statement
180
+ # * selector name { block }
181
+ #
182
+ # The +name+ argument (a string or symbol) specifies the selector name,
183
+ # defining a class method with that name that can be used to identify matching
184
+ # element.
185
+ #
186
+ # The selector can be a CSS-style selector (string) or a block that accepts a
187
+ # single argument (element) and returns true or false.
188
+ #
189
+ # For example:
190
+ # selector :select_link { |node| node.name == 'a' }
191
+ # extractor :extract_link { |node| node.attributes['href'] }
192
+ # rule :links, :select_link, :extract_link
193
# Defines a named selector as a class method.
#
# name::     Selector name (string or symbol); becomes a class method that
#            takes a node and tells whether it matches.
# selector:: nil or a CSS-style selector statement (string).
# block::    Used when +selector+ is nil; receives the node, returns true/false.
#
# With neither a statement nor a block, the selector matches elements whose
# class attribute contains +name+.
#
# Raises InvalidSelectorException for a missing name or unsupported type.
def selector(name, selector = nil, &block)
  raise InvalidSelectorException, "First argument (selector name) is required" unless name
  matcher = case selector
  when NilClass
    if block
      block
    else
      pattern = Regexp.new("\\b#{Regexp.escape(name.to_s)}\\b")
      lambda { |node| node.attributes['class'] =~ pattern }
    end
  when String
    # Selector objects answer to #match rather than #call, so wrap the
    # compiled selector in a callable.
    compiled = Selector.create(selector)
    lambda { |node| compiled.match(node) }
  else
    raise InvalidSelectorException, "Invalid selector type: must be a string, block or nil"
  end
  # Create a class method with the selector's name that runs the matcher.
  (class << self; self; end).send(:define_method, name) { |node| matcher.call(node) }
end
217
+
218
+
219
+ # Creates a new extractor.
220
+ #
221
+ # There are two ways to create an extractor:
222
+ # * extractor name, statement
223
+ # * extractor name { block }
224
+ #
225
+ # The +name+ argument (string or symbol) specifies the extractor name,
226
+ # defining a class method with that name that can be used to extract the
227
+ # value of a node.
228
+ #
229
+ # The extractor can be an expression (string) or a block that accepts a
230
+ # single argument (element) and returns the extracted value, or nil.
231
+ #
232
+ # For example:
233
+ # selector :select_link { |node| node.name == 'a' }
234
+ # extractor :extract_link { |node| node.attributes['href'] }
235
+ # rule :links, :select_link, :extract_link
236
+ #
237
+ # The expression takes the form of:
238
+ # extractor := extract (|extract)*
239
+ # extract := element | @attribute | element@attribute | method()
240
+ #
241
+ # If multiple extracts are specified, the first extracted value is used.
242
+ #
243
+ # If an element is specified, the text value is extracted only if the selected
244
+ # node is an element of that type. If an attribute is specified, the extracted
245
+ # value is the attribute's value. If both element and attribute are used, the
246
+ # attribute value is extracted only if the selected node is an element of that
247
+ # type.
248
+ #
249
+ # If a method is specified, that method is called for the node. There are two
250
+ # methods available in any class: +text+ and +xml+.
251
# Defines a named extractor as a class method.
#
# name::      Extractor name (string or symbol); becomes a class method that
#             takes a node and returns the extracted value, or nil.
# extractor:: nil or an extractor expression (string).
# block::     Used when +extractor+ is nil; receives the node and returns the
#             value.
#
# With neither an expression nor a block, the default extractor is used.
#
# Raises InvalidExtractorException for a missing name, an unsupported type
# or an invalid expression.
def extractor(name, extractor = nil, &block)
  raise InvalidExtractorException, "First argument (extractor name) is required" unless name
  source = case extractor
  when NilClass
    # Absent extractor: either the block if provided, otherwise the default
    block ? block : default_extractor
  when String
    # Extractor expression; functions in it are resolved against this class
    Extractor.new(self, extractor)
  else
    raise InvalidExtractorException, "Invalid extractor type: must be a string, parser class, block or nil"
  end
  # Extractor objects answer to #extract rather than #call, so adapt them.
  reader = source.respond_to?(:extract) ? lambda { |node| source.extract(node) } : source
  # Create a class method with the extractor's name that runs the reader.
  (class << self; self; end).send(:define_method, name) { |node| reader.call(node) }
end
269
+
270
# Returns the extractor applied when a rule or extractor definition does not
# specify one (the shared DEFAULT_EXTRACTOR constant).
def default_extractor
  DEFAULT_EXTRACTOR
end
274
+
275
+ # Called to parse a node.
276
+ #
277
+ # The node may be an element (REXML::Element) or a document (REXML::Document).
278
+ #
279
+ # For example:
280
+ # class ParseLinks
281
+ # include MicroformatParser
282
+ #
283
+ # rule :links, "a", "@href"
284
+ # rule :ids, "a[id]", "a@id"
285
+ # end
286
+ #
287
+ # parsed = ParseLinks.parse(doc)
288
+ # puts parsed.links
289
+ # puts parsed.ids
290
# Applies the rules to +node+ and, recursively, to its child elements.
#
# node::    Element or document to parse (REXML::Element or a subclass);
#           anything else is ignored.
# context:: Object that receives the extracted values; a new instance of this
#           class is created when omitted.
# rules::   Rules to apply; defaults to the rules defined for this class.
#
# Returns the context object.
def parse(node, context = nil, rules = nil)
  # This method can be called on the class (creating a new instance) or
  # recursively with an existing context and a reduced ruleset.
  context ||= self.new
  rules ||= self.rules
  # We must have rules and the node must be an element/document
  if rules and node.kind_of? REXML::Element
    # A rule that matched this node is not applied to its descendants
    # (structured rules process child nodes themselves); other rules still
    # are. The ruleset is copied lazily so the caller's array is untouched.
    remaining = nil
    rules.each_with_index do |rule, index|
      next unless rule and rule.process(node, context)
      remaining ||= rules.dup
      remaining[index] = nil
    end
    rules = remaining if remaining
    # Always hand the current (possibly reduced) ruleset to the children.
    # Passing nil here would make them fall back to the full ruleset and
    # re-enable rules that an ancestor had already consumed.
    node.elements.each { |child| parse(child, context, rules) }
  end
  context
end
316
+
317
# Returns the rules defined for this class with +rule+, as an array.
#
# The array is created on first access and can be inspected or modified
# directly (adding/removing rules). Rules are processed in the order in
# which they appear.
def rules
  @microparser_rules ||= []
end
328
+
329
# Returns the text value of a node.
#
# Concatenates the values of all text children, including CDATA sections,
# and the text of child elements recursively, in document order. Other
# children (comments, instructions) are ignored.
def text(node)
  node.children.inject(''.dup) do |value, child|
    case child
    when REXML::Text
      # REXML::CData is a subclass of REXML::Text, so CDATA content counts
      value << child.value
    when REXML::Element
      value << text(child)
    else
      value
    end
  end
end
341
+
342
# Returns the XML value of a node, which is the node object itself
# (no copy is made).
def xml(node)
  node
end
346
+
347
+ module_function :text, :xml
348
+
349
+
350
+
351
+ private
352
+
353
# Hook called by Ruby when this module is included somewhere: extends the
# including class/module with this module, so the methods defined here
# (rule, selector, extractor, parse, ...) become available on it directly.
def self.included(mod)
  mod.extend self
end
356
+
357
+
358
+ # Implements a rule.
359
+ #
360
+ # A rule identifies matching nodes using a selector, and a means to extract their value
361
+ # using an extractor. The rule also identifies an instance variable and attribute accessor
362
+ # to retrieve the extracted value, and the cardinality of that value.
363
+ #
364
+ # For more information see MicroformatParser.rule.
365
# Implements a rule.
#
# A rule pairs a selector (which nodes match) with an extractor (how to get
# a value from a matching node), names the instance variable that receives
# the value, and limits how many values are kept.
class Rule

  # The instance variable name (with leading @) that holds the result.
  attr_reader :name
  # The rule cardinality (or value limit)
  # 0::  No value (disabled)
  # 1::  First value extracted
  # n::  Up to n values (array)
  # -1:: Unbound (array)
  attr_accessor :limit
  # The rule selector
  attr_reader :selector
  # The rule extractor
  attr_reader :extractor

  # name::      Rule name; stored as the instance variable name (:@name).
  # selector::  Selector, proc/method, unbound method, or nil (match all).
  # extractor:: Extractor, proc/method, unbound method, or parser class.
  # limit::     Cardinality, see +limit+.
  def initialize(name, selector, extractor, limit)
    @name = "@#{name}".to_sym
    @selector = selector
    @extractor = extractor
    @limit = limit
  end

  # Processes this rule on +node+, storing any extracted value on +context+.
  #
  # Returns true if the rule was consumed and should not be applied to the
  # node's descendants, false otherwise.
  def process(node, context)
    # Disabled rule (limit = 0): nothing to do, reduce it.
    return true if @limit == 0
    current = context.instance_variable_get(@name)
    # Nothing more can be stored once the limit is reached, so skip the
    # selector and extractor entirely.
    return true if @limit == 1 and current
    return true if @limit > 1 and current.instance_of?(Array) and current.size >= @limit
    return false unless selected?(node, context)
    value = extract_value(node, context)
    return false unless value
    store(context, current, value)
    # The rule matched this node; there is no point applying it below it.
    true
  end

  def inspect
    @selector ? "[to #{@name} from #{@selector.inspect}, #{@extractor.inspect}, limit #{@limit}]" : "[to #{@name} from #{@extractor.inspect}, limit #{@limit}]"
  end

  private

  # Tells whether the selector accepts the node; a rule without a selector
  # (one created to wrap a block) accepts every node.
  def selected?(node, context)
    return true unless @selector
    case @selector
    when UnboundMethod then @selector.bind(context).call(node)
    when Selector then @selector.match(node)
    else @selector.call(node)
    end
  end

  # Runs the extractor on the node; returns nil for unsupported extractors.
  def extract_value(node, context)
    case @extractor
    when UnboundMethod then @extractor.bind(context).call(node)
    when Extractor then @extractor.extract(node)
    when Proc, Method then @extractor.call(node)
    when Class then @extractor.parse(node)
    end
  end

  # Stores the value on the context: directly for singular rules, otherwise
  # in an array that grows until the limit is reached.
  def store(context, current, value)
    if @limit == 1
      context.instance_variable_set(@name, value)
    elsif not current
      context.instance_variable_set(@name, [value])
    elsif current.instance_of? Array and (@limit < 0 or current.size < @limit)
      current << value
    end
  end

end
437
+
438
+
439
+ # Implements a selector using a CSS-style expression.
440
+ #
441
+ # For more information see MicroformatParser.selector.
442
# Implements a selector using a CSS-style expression.
#
# Supports element names, #id, .class, [attr], [attr=value], [attr~=value],
# [attr|=value], and the compound forms "a, b" (alternative), "a + b"
# (preceding sibling), "a > b" (child) and "a b" (descendant).
#
# For more information see MicroformatParser.selector.
class Selector

  # :stopdoc:

  # Parse each selector into five parts:
  # $1 element name or * (optional)
  # $2 ID name (including leading #, optional)
  # $3 class names (including leading ., zero or more)
  # $4 attribute expressions (zero or more)
  # $5 anything else (no leading spaces)
  # Attribute values stop at the first ], so a later simple selector with its
  # own attribute expressions is left for $5.
  REGEX = /^(\*|[A-Za-z][A-Za-z0-9_\-:]*)?(#[A-Za-z][A-Za-z0-9_\-:]*)?((?:\.[A-Za-z][A-Za-z0-9_\-:]*){0,})((?:\[[A-Za-z][A-Za-z0-9_\-:]*(?:(?:~|\|)?=[^\]]*)?\]){0,})\s*(.*)$/

  # Parse each attribute expression into three parts:
  # $1 attribute name
  # $2 matching operation
  # $3 matched value
  # Matching operation may be =, ~= or |=. Value may be empty.
  ATTR_REGEX = /^([A-Za-z][A-Za-z0-9_\-:]*)((?:~|\|)?=)?(.*)$/

  # :startdoc:

  # tag_name:: Element name to match, or nil for any element.
  # attrs::    Array of [attribute name, Regexp] pairs that must all match.
  # alt::      Alternative selector tried when this one does not match.
  # depends::  Block called with the node when this selector matches; its
  #            result decides the match (parent/sibling/ancestor checks).
  def initialize(tag_name, attrs, alt = nil, &depends)
    @tag_name = tag_name
    @attrs = attrs
    @alt = alt
    @depends = depends
  end

  # Creates a new selector from a CSS-style statement.
  #
  # The statement is not modified. Raises InvalidSelectorException when the
  # statement is empty or cannot be parsed.
  def Selector.create(statement, alt = nil, &depends)
    statement = statement.strip
    # Parse the first selector expression into $1-$4, anything else goes in $5
    parts = REGEX.match(statement)
    raise InvalidSelectorException, "Invalid (empty) selector statement" if parts.nil? or parts[0].empty?

    # Set tag_name to the element name if specified and not *
    tag_name = (parts[1] and parts[1] != '*') ? parts[1] : nil
    # Regular expressions for matching attributes. An array is used since the
    # same attribute may appear more than once (e.g. classes 'foo' and 'bar').
    attrs = []
    # Match the ID attribute if specified; drop the leading # first
    attrs << ['id', Regexp.new('^' + Regexp.escape(parts[2][1..-1]) + '$')] if parts[2]
    # Class names are prefixed with a dot; the class attribute is a set of
    # space-separated names, so match each one as a word.
    parts[3].split('.').each do |cls|
      attrs << ['class', Regexp.new('\b' + Regexp.escape(cls) + '\b')] unless cls.empty?
    end
    # Remaining attribute expressions are enclosed in square brackets;
    # splitting on the brackets may yield empty strings, skip those.
    parts[4].split(/\[|\]/).each do |expr|
      next if expr.empty?
      name, type, value = ATTR_REGEX.match(expr)[1..3]
      literal = Regexp.escape(value)
      match = case type
      when '=' then Regexp.new('^' + literal + '$')   # full value
      when '~=' then Regexp.new('\b' + literal + '\b') # word within value
      when '|=' then Regexp.new('^' + literal)         # beginning of value
      else Regexp.new('')                              # existence check
      end
      attrs << [name, match]
    end

    # If there's nothing else in the statement, return this selector.
    selector = Selector.new(tag_name, attrs, alt, &depends)
    rest = parts[5]
    return selector if rest.empty?

    # Create a compound selector based on the remainder of the statement.
    # This is also why we need the factory and can't call new directly.
    case rest[0, 1]
    when ','
      # Alternative selector: second statement is alternative to the first one
      Selector.create(rest[1..-1], selector)
    when '+'
      # Sibling selector: the second statement matches a node whose previous
      # sibling element matches the first statement
      Selector.create(rest[1..-1]) do |node|
        node.previous_element and selector.match(node.previous_element)
      end
    when '>'
      # Child selector: the second statement matches a node whose parent
      # matches the first statement
      Selector.create(rest[1..-1]) do |node|
        node.parent ? selector.match(node.parent) : false
      end
    else
      # Nothing was consumed, so the statement cannot be parsed; recursing
      # on it would never terminate.
      raise InvalidSelectorException, "Invalid selector statement: #{statement}" if rest == statement
      # Descendant selector: the second statement matches a node that has an
      # ancestor matching the first statement
      Selector.create(rest) do |node|
        ancestor = node.parent
        ancestor = ancestor.parent until ancestor.nil? or selector.match(ancestor)
        !ancestor.nil?
      end
    end
  end

  # Creates a new selector for the given class name.
  def Selector.for_class(cls)
    Selector.new(nil, [["class", Regexp.new('\b' + Regexp.escape(cls.to_s) + '\b')]])
  end

  # Tells whether the node matches this selector.
  def match(node)
    # Match node if no element name or element name same as node name,
    # and every attribute expression matches an existing attribute.
    matched = (!@tag_name or @tag_name == node.name)
    matched &&= @attrs.all? do |name, pattern|
      value = node.attributes[name]
      !value.nil? and pattern =~ value
    end
    # If the node did not match, but we have an alternative (x,y),
    # apply the alternative match instead
    return @alt.match(node) if !matched and @alt
    # If the node did match, but depends on another match (parent,
    # sibling, etc), apply the dependent match as well
    return @depends.call(node) if matched and @depends
    matched
  end

  # Returns an approximate selector statement for debugging.
  def inspect
    stmt = @tag_name ? @tag_name : ''
    @attrs.each do |name, pattern|
      source = pattern.source
      stmt += "[#{name}"
      if source =~ /^\\b(.*)\\b$/
        stmt += "~=#{unescape($1)}"
      elsif source =~ /^\^(.*)\$$/
        stmt += "=#{unescape($1)}"
      elsif source =~ /^\^(.*)$/
        stmt += "|=#{unescape($1)}"
      end
      stmt += ']'
    end
    stmt += ',' + @alt.inspect if @alt
    stmt
  end

  private

  # Removes the backslashes added by Regexp.escape.
  def unescape(text)
    text.gsub(/\\(.)/) { $1 }
  end

end
594
+
595
+
596
+ # Implements an extractor using a simple expression format.
597
+ #
598
+ # For more information see MicroformatParser.extractor.
599
# Implements an extractor using a simple expression format:
#   extractor := extract (|extract)*
#   extract   := element | @attribute | element@attribute | method()
#
# For more information see MicroformatParser.extractor.
class Extractor

  # :stopdoc:

  # Parse each extract into three parts:
  # $1 function name (excluding parentheses)
  # $2 element name
  # $3 attribute name (including leading @)
  # If a match is found the result is either $1, or $2 and/or $3.
  # The whole extract must match; both alternatives are anchored.
  REGEX = /\A(?:(\w+)\(\)|([A-Za-z][A-Za-z0-9_\-:]*)?(@[A-Za-z][A-Za-z0-9_\-:]*)?)\z/

  # :startdoc:

  # context::   Object (normally the rule class) on which functions named in
  #             the statement, and +text+, are looked up.
  # statement:: Extractor expression; it is not modified.
  #
  # Raises InvalidExtractorException for an empty or invalid statement, or
  # when a named function does not exist on the context.
  def initialize(context, statement)
    @statement = statement.strip
    # Break the statement into multiple extraction rules, separated by |.
    @extracts = @statement.split('|').map { |extract| compile(context, extract.strip) }
    raise InvalidExtractorException, "Invalid (empty) extraction statement" if @extracts.empty?
  end

  # Extracts a value from the node: the first non-nil value produced by the
  # extraction rules, in order, or nil.
  def extract(node)
    @extracts.each do |extract|
      value = extract.call(node)
      return value if value
    end
    nil
  end

  # Returns the extractor expression.
  def inspect
    @statement
  end

  private

  # Turns a single extract expression into a callable taking a node.
  def compile(context, extract)
    parts = REGEX.match(extract)
    raise InvalidExtractorException, "Invalid extraction statement" unless parts
    function, element, attribute = parts[1], parts[2], parts[3]
    attribute = attribute[1..-1] if attribute
    if function
      # Function: a method of the context object (the rule class).
      lookup(context, function)
    elsif element and attribute
      # Apply only if element of this type, and extract the named attribute.
      lambda { |node| node.attributes[attribute] if node.name == element }
    elsif element
      # Apply only if element of this type, and extract the text value.
      # The text function lives on the context, not on this class.
      text_of = lookup(context, 'text')
      lambda { |node| text_of.call(node) if node.name == element }
    elsif attribute
      # Extract the named attribute.
      lambda { |node| node.attributes[attribute] }
    else
      raise InvalidExtractorException, "Invalid extraction statement"
    end
  end

  # Finds a method on the context, reporting an unknown name as an invalid
  # extractor.
  def lookup(context, name)
    context.method(name)
  rescue NameError => error
    raise InvalidExtractorException, error.message, error.backtrace
  end

end
661
+
662
+ DEFAULT_EXTRACTOR = Extractor.new(self, "abbr@title|a@href|text()")
663
+
664
# Base class for InvalidSelectorException and InvalidExtractorException.
# Also raised when a rule is defined with invalid arguments.
#
# Derives from StandardError so that a plain +rescue+ catches it.
class InvalidRuleException < StandardError
end

# Raised to indicate an invalid selector statement.
class InvalidSelectorException < InvalidRuleException
end

# Raised to indicate an invalid extractor statement.
class InvalidExtractorException < InvalidRuleException
end
676
+
677
+
678
+ end
679
+
680
+
681
+ # A parser for several microformats.
682
+ #
683
+ # Defines rules for the following attributes:
684
+ # tags:: A list of tags based on relTag (array of String)
685
+ # events:: A list of events based on hCalendar (array of HCalendar)
686
+ #
687
+ # For example:
688
+ # content = Microformats.parse(doc)
689
+ # puts "Tagged with " + content.tags.join(", ") if content.tags
690
class Microformats

  include MicroformatParser

  # Parses the fields of an hCalendar element.
  #
  # Defines rules for the following attributes:
  # dtstart::     The event's start date/time (a Time object)
  # dtend::       The event's end date/time (a Time object)
  # summary::     The event's summary (text value)
  # description:: The event's description (XML node)
  # url::         The event's URL (string)
  # location::    The event's location (XML node)
  # contact::     The event's contact (XML node)
  #
  # For example:
  #   content = Microformats.parse(doc)
  #   content.events.each do |event|
  #     puts "Event on #{event.dtstart}"
  #   end
  class HCalendar

    include MicroformatParser

    # Extracts a date/time: the title attribute of an abbr element,
    # otherwise the node's text. Returns a Time, or nil when there is no
    # value or it cannot be parsed as a date/time (Time.parse requires the
    # 'time' library).
    extractor :dt_extractor do |node|
      value = node.attributes['title'] if node.name == 'abbr'
      value = text(node) unless value
      begin
        value ? Time.parse(value) : nil
      rescue ArgumentError
        # Empty or malformed date: treat as "no value" rather than aborting
        # the parse of the whole document.
        nil
      end
    end

    rule_1 :dtstart, nil, :dt_extractor
    rule_1 :dtend, nil, :dt_extractor
    rule_1 :summary, nil, :text
    rule_1 :description, nil, :xml
    rule_1 :url, nil, "a@href"
    rule_1 :location, nil, :xml
    rule_1 :contact, nil, :xml

  end

  rule :tags, "a[rel~=tag]", "text()"
  rule :events, ".vevent", HCalendar

end
metadata ADDED
@@ -0,0 +1,50 @@
1
+ --- !ruby/object:Gem::Specification
2
+ rubygems_version: 0.8.3
3
+ specification_version: 1
4
+ name: uformatparser
5
+ version: !ruby/object:Gem::Version
6
+ version: 1.0.0
7
+ date: 2005-11-20
8
+ summary: Microformat parser for extracting microcontent from (X)HTML
9
+ require_paths:
10
+ - lib
11
+ email: assaf@labnotes.org
12
+ homepage: http://trac.labnotes.org/cgi-bin/trac.cgi/wiki/Ruby/MicroformatParser
13
+ rubyforge_project: uformatparser
14
+ description: "Parser for extracting microcontent from (X)HTML documents, in any number of
15
+ microformats. Uses a DSL for specifying the parsing rules as a set of selectors
16
+ and extractors. Supports a CSS-like selector and XPath-like extractor syntaxes
17
+ for quick and easy rule writing. Also supports reusable and compound rules,"
18
+ autorequire: uformatparser.rb
19
+ default_executable:
20
+ bindir: bin
21
+ has_rdoc: true
22
+ required_ruby_version: !ruby/object:Gem::Version::Requirement
23
+ requirements:
24
+ -
25
+ - ">"
26
+ - !ruby/object:Gem::Version
27
+ version: 0.0.0
28
+ version:
29
+ platform: ruby
30
+ authors:
31
+ - Assaf Arkin
32
+ files:
33
+ - lib/uformatparser.rb
34
+ - README
35
+ - Rakefile
36
+ - MIT-LICENSE
37
+ test_files: []
38
+ rdoc_options:
39
+ - "--main"
40
+ - README
41
+ - "--title"
42
+ - Microformat parser
43
+ - "--line-numbers"
44
+ extra_rdoc_files:
45
+ - README
46
+ executables: []
47
+ extensions: []
48
+ requirements:
49
+ - ReXML. HTML->ReXML parser.
50
+ dependencies: []