uformatparser 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (5) hide show
  1. data/MIT-LICENSE +20 -0
  2. data/README +44 -0
  3. data/Rakefile +74 -0
  4. data/lib/uformatparser.rb +731 -0
  5. metadata +50 -0
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2005 Assaf Arkin
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README ADDED
@@ -0,0 +1,44 @@
1
+ = Microformat Parser
2
+
3
+ MicroformatParser is a Ruby module for creating microformat parsers.
4
+ A microformat parser is a class with a set of rules for extracting
5
+ interesting content from (X)HTML documents. You create your own parser
6
+ by writing a class with a set of rules. The magic happens in the parse
7
+ method which takes an (X)HTML document or element, runs all the rules
8
+ on it, and returns a new object that holds the extracted values.
9
+
10
+ Here's a simple example to find all links and all tags in a document:
11
+
12
+ class MyParser
13
+ include MicroformatParser
14
+
15
+ rule :links, "a", "a@href"
16
+ rule :tags, "a[rel~=tag]", "text()"
17
+ end
18
+
19
+ content = MyParser.parse(doc)
20
+ puts "Found " + content.links.size.to_s + " links" if content.links
21
+ puts "Tagged with " + content.tags.join(', ') if content.tags
22
+
23
+
24
+ == Documentation
25
+
26
+ You may want to read the documentation for a more detailed discussion of
27
+ selectors, extractors, compound rules, (X)HTML parsing and examples
28
+
29
+ http://trac.labnotes.org/cgi-bin/trac.cgi/wiki/Ruby/MicroformatParser
30
+
31
+
32
+ == Download
33
+
34
+ The latest version can be found at
35
+
36
+ http://rubyforge.org/projects/uformatparser/
37
+
38
+ == License
39
+
40
+ This package is licensed under the MIT license and/or the {Creative
41
+ Commons Attribution-ShareAlike}[http://creativecommons.org/licenses/by-sa/2.5/legalcode].
42
+
43
+ :include: MIT-LICENSE
44
+
@@ -0,0 +1,74 @@
1
+ # Adapted from the rake Rakefile.
2
+
3
+ require 'rubygems'
4
+ Gem::manage_gems
5
+ require 'rake/testtask'
6
+ require 'rake/rdoctask'
7
+ require 'rake/gempackagetask'
8
+
9
+
10
+ desc "Default Task"
11
+ task :default => [:tests, :rdoc]
12
+
13
+
14
+ Rake::TestTask.new :tests do |test|
15
+ test.verbose = true
16
+ test.test_files = ['test/*.rb']
17
+ end
18
+
19
+
20
+ # Create the documentation.
21
+ Rake::RDocTask.new do |rdoc|
22
+ rdoc.main = "README"
23
+ rdoc.rdoc_files.include("README", "lib/**/*.rb")
24
+ rdoc.title = 'Microformat Parser'
25
+ end
26
+
27
+
28
+ # Create the GEM package.
29
+ gem_spec = Gem::Specification.new do |spec|
30
+ spec.name = 'uformatparser'
31
+ spec.version = "1.0.0"
32
+ spec.summary = "Microformat parser for extracting microcontent from (X)HTML"
33
+ spec.description = <<-EOF
34
+ Parser for extracting microcontent from (X)HTML documents, in any number
35
+ of microformats.
36
+
37
+ Uses a DSL for specifying the parsing rules as a set of selectors and
38
+ extractors. Supports a CSS-like selector and XPath-like extractor syntaxes
39
+ for quick and easy rule writing. Also supports reusable and compound rules,
40
+ EOF
41
+ spec.author = "Assaf Arkin"
42
+ spec.email = "assaf@labnotes.org"
43
+ spec.homepage = "http://trac.labnotes.org/cgi-bin/trac.cgi/wiki/Ruby/MicroformatParser"
44
+
45
+ spec.files = FileList["{test,lib}/**/*", "README", "Rakefile", "MIT-LICENSE"].to_a
46
+ spec.require_path = "lib"
47
+ spec.autorequire = 'uformatparser.rb'
48
+ spec.requirements << "ReXML. HTML->ReXML parser."
49
+ spec.has_rdoc = true
50
+ spec.rdoc_options << '--main' << 'README' << '--title' << 'Microformat parser' << '--line-numbers'
51
+ spec.extra_rdoc_files = ["README"]
52
+ spec.rubyforge_project = "uformatparser"
53
+ end
54
+
55
+ gem = Rake::GemPackageTask.new(gem_spec) do |pkg|
56
+ pkg.need_tar = true
57
+ pkg.need_zip = true
58
+ end
59
+
60
+
61
+ # --------------------------------------------------------------------
62
+ # Creating a release
63
+
64
+ desc "Make a new release"
65
+ task :release => [:tests, :clobber, :package] do
66
+ puts
67
+ puts "**************************************************************"
68
+ puts "* Release #{gem_spec.version} Complete."
69
+ puts "* Packages ready to upload."
70
+ puts "**************************************************************"
71
+ puts
72
+ end
73
+
74
+
@@ -0,0 +1,731 @@
1
+ #
2
+ # = uformatparser.rb - Microformat parser
3
+ #
4
+ #--
5
+ # Author:: Assaf Arkin assaf@labnotes.org
6
+ # Documentation:: http://trac.labnotes.org/cgi-bin/trac.cgi/wiki/Ruby/MicroformatParser
7
+ # Copyright:: Copyright (c) 2005 Assaf Arkin
8
+ # License:: Creative Commons Attribution-ShareAlike
9
+ #
10
+ #++
11
+
12
+ require 'rexml/document'
13
+
14
+
15
+ # Implements a microformat parser by extending a class that includes this module.
16
+ #
17
+ # === The Basics
18
+ #
19
+ # To create a microformat parser, extend a class with this module and use the
20
+ # +rule+ method to define parsing rules for that class. Call +parse+ to parse the
21
+ # content, returning a new instance of the class holding all values extracted from
22
+ # parsing. You can parse a document or an element.
23
+ #
24
+ # For example:
25
+ # class Microformats
26
+ # include MicroformatParser
27
+ #
28
+ # class HCalendar
29
+ # include MicroformatParser
30
+ #
31
+ # # Extract ISO date/time
32
+ # extractor :dt_extractor do |node|
33
+ # value = node.attributes['title'] if node.name == 'abbr'
34
+ # value = text(node) unless value
35
+ # value ? Time.parse(value) : nil
36
+ # end
37
+ #
38
+ # rule_1 :dtstart, nil, :dt_extractor
39
+ # rule_1 :dtend, nil, :dt_extractor
40
+ # rule_1 :summary, nil, :text
41
+ # rule_1 :description, nil, :xml
42
+ # rule_1 :url, nil, "a@href"
43
+ # end
44
+ #
45
+ # rule :tags, "a[rel~=tag]", "text()"
46
+ # rule :events, ".vevent", HCalendar
47
+ # end
48
+ #
49
+ # content = Microformats.parse(doc)
50
+ # puts content.tags
51
+ # puts content.events
52
+ #
53
+ module MicroformatParser
54
+
55
+
56
+ # Create a new rule.
57
+ #
58
+ # There are two ways to define a rule:
59
+ # * rule name, selector?, extractor?, limit?
60
+ # * rule name, limit? { block }
61
+ #
62
+ # The +name+ argument specifies an instance variable that holds the value
63
+ # (or values) extracted from processing this rule. It can be a string or
64
+ # a symbol. An attribute accessor is created with that name.
65
+ #
66
+ # The +selector+ argument identifies all nodes that match the rule. It can
67
+ # be a CSS-style selector (string) or a method/proc. A symbol specifies
68
+ # a method to use from this class. The method/proc receives a single argument
69
+ # with the node and must return true/false.
70
+ #
71
+ # If selector is absent, the default selector will match any element with
72
+ # a class of the same name as the name argument. For example:
73
+ # rule :dtstart
74
+ # Matches all elements with the class _dtstart_.
75
+ #
76
+ # The +extractor+ argument specifies how to extract a value from a selected
77
+ # node. It can be a list of extract rules (string), a method/proc, or a class.
78
+ # A symbol specifies a method to use from this class. The method/proc receives
79
+ # a single argument with the node and returns the extracted value, or nil.
80
+ #
81
+ # If the extractor is a class, it references a microformat parser which is
82
+ # then called to parse the content of a matching element.
83
+ #
84
+ # If extractor is absent, the default extractor is used:
85
+ # abbr@title|a@href|text()
86
+ #
87
+ # The +limit+ argument specifies the cardinality of the rule's value:
88
+ # 0 The rule is never applied
89
+ # 1 The rule is applied once, the first extracted value is set
90
+ # -1 The rule is applied multiple times, extracted values are set in an array
91
+ # n The rule is applied up to _n_ times, extracted values are set in an array
92
+ #
93
+ # In the second form, a block is specified instead of the selector/extractor.
94
+ # The block is called with a node and returns the extracted value, or nil.
95
# Defines a parsing rule on this class and an attribute accessor of the same
# name that holds the extracted value(s).
#
# name::      Rule name (String or Symbol); required.
# selector::  nil (match elements whose class contains the rule name), a
#             CSS-style String, a Proc/Method, or a Symbol naming a method
#             of this class.
# extractor:: nil (default extractor), an extractor expression String, a
#             Proc/Method, a Symbol naming a method of this class, or a
#             Class that responds to +parse+.
# limit::     Cardinality of the value (0, 1, n or -1 for unbounded).
# block::     Alternative to selector/extractor; receives the node and
#             returns the extracted value or nil.
#
# Raises InvalidRuleException, InvalidSelectorException or
# InvalidExtractorException when the arguments cannot form a rule.
# Returns the class's rule list with the new rule appended.
def rule(name, selector = nil, extractor = nil, limit = -1, &block)
  raise InvalidRuleException, "First argument (rule name) is required" unless name
  if block
    # The rule processing is taken from the block, everything else must be nil
    if selector || extractor
      raise InvalidRuleException, "Can't specify selector/extractor in combination with proc"
    end
    new_rule = Rule.new(name, nil, block, limit)
  else
    # Determine the selector.
    matcher = case selector
    when NilClass
      # Absent selector: match elements whose class attribute contains the
      # rule name as a whole word.
      class_pattern = Regexp.new("\\b#{name}\\b")
      proc { |node| node.attributes['class'] =~ class_pattern }
    when String
      # CSS-style selector
      Selector.create(selector)
    when Proc, Method
      # Use as is
      selector
    when Symbol
      # Look up the named method on this class. Object#method raises
      # NameError for an unknown name, so translate that into the
      # library's own exception and report the name that was looked up.
      begin
        method(selector)
      rescue NameError => error
        raise InvalidSelectorException, "Method #{selector} is not a valid selector", error.backtrace
      end
    else
      raise InvalidSelectorException, "Invalid selector type: must be a string, symbol, proc/method or nil"
    end

    # Determine the extractor.
    reader = case extractor
    when NilClass
      # Absent extractor: use the default extractor
      default_extractor
    when String
      # Extractor expression
      Extractor.new(self, extractor)
    when Proc, Method
      # Use as is
      extractor
    when Symbol
      # Same lookup and error translation as for a Symbol selector.
      begin
        method(extractor)
      rescue NameError => error
        raise InvalidExtractorException, "Method #{extractor} is not a valid extractor", error.backtrace
      end
    when Class
      # Extractor is a class, generally another ruleset, so its parse
      # method will be called (and therefore must exist).
      begin
        extractor.method(:parse)
      rescue NameError => error
        raise InvalidExtractorException, "Extractor class must implement the method parse", error.backtrace
      end
      extractor
    else
      raise InvalidExtractorException, "Invalid extractor type: must be a string, parser class, block or nil"
    end

    new_rule = Rule.new(name, matcher, reader, limit)
  end

  # Create a read/write accessor named after the rule; it holds the rule value.
  attr_accessor name
  # Add this rule to the class's ruleset.
  rules << new_rule
end
165
+
166
+
167
+ # Create a new rule that extracts at most one value.
168
+ #
169
+ # Same as calling +rule+ with +limit+=1
170
# Shorthand for a single-valued rule: forwards every argument (and the
# optional block) to +rule+ with the limit fixed at one.
def rule_1(name, selector = nil, extractor = nil, &block)
  single_value = 1
  rule(name, selector, extractor, single_value, &block)
end
174
+
175
+
176
+ # Creates a new selector.
177
+ #
178
+ # There are two ways to create a selector:
179
+ # * selector name, statement
180
+ # * selector name { block }
181
+ #
182
+ # The +name+ argument (a string or symbol) specifies the selector name,
183
+ # defining a class method with that name that can be used to identify matching
184
+ # element.
185
+ #
186
+ # The selector can be a CSS-style selector (string) or a block that accepts a
187
+ # single argument (element) and returns true or false.
188
+ #
189
+ # For example:
190
+ # selector :select_link { |node| node.name == 'a' }
191
+ # extractor :extract_link { |node| node.attributes['href'] }
192
+ # rule :links, :select_link, :extract_link
193
# Defines a named selector as a class-level method that takes a node and
# reports whether it matches.
#
# name::     Selector name (String or Symbol); required.
# selector:: nil or a CSS-style String. With nil, the block is used if
#            given; otherwise elements whose class attribute contains the
#            selector name are matched.
# block::    Receives a node and returns true/false.
#
# Raises InvalidSelectorException for a missing name or unsupported
# selector type.
def selector(name, selector = nil, &block)
  raise InvalidSelectorException, "First argument (selector name) is required" unless name
  matcher = case selector
  when NilClass
    if block
      block
    else
      class_pattern = Regexp.new("\\b#{name}\\b")
      proc { |node| node.attributes['class'] =~ class_pattern }
    end
  when String
    # CSS-style selector
    Selector.create(selector)
  else
    raise InvalidSelectorException, "Invalid selector type: must be a string, block or nil"
  end
  # Procs answer to +call+ while Selector objects answer to +match+, so
  # dispatch on what the matcher actually supports.
  define_singleton_method(name) do |node|
    matcher.respond_to?(:call) ? matcher.call(node) : matcher.match(node)
  end
end
217
+
218
+
219
+ # Creates a new extractor.
220
+ #
221
+ # There are two ways to create an extractor:
222
+ # * extractor name, statement
223
+ # * extractor name { block }
224
+ #
225
+ # The +name+ argument (string or symbol) specifies the extractor name,
226
+ # defining a class method with that name that can be used to extract the
227
+ # value of a node.
228
+ #
229
+ # The extractor can be an expression (string) or a block that accepts a
230
+ # single argument (element) and returns the extracted value, or nil.
231
+ #
232
+ # For example:
233
+ # selector :select_link { |node| node.name == 'a' }
234
+ # extractor :extract_link { |node| node.attributes['href'] }
235
+ # rule :links, :select_link, :extract_link
236
+ #
237
+ # The expression takes the form of:
238
+ # extractor := extract (|extract)*
239
+ # extract := element | @attribute | element@attribute | method()
240
+ #
241
+ # If multiple extracts are specified, the first extracted value is used.
242
+ #
243
+ # If an element is specified, the text value is extracted only if the selected
244
+ # node is an element of that type. If an attribute is specified, the extracted
245
+ # value is the attribute's value. If both element and attribute are used, the
246
+ # attribute value is extracted only if the selected node is an element of that
247
+ # type.
248
+ #
249
+ # If a method is specified, that method is called for the node. There are two
250
+ # methods available in any class: +text+ and +xml+.
251
# Defines a named extractor as a class-level method that takes a node and
# returns the extracted value (or nil).
#
# name::      Extractor name (String or Symbol); required.
# extractor:: nil or an extractor expression String. With nil, the block
#             is used if given, otherwise the default extractor.
# block::     Receives a node and returns the extracted value or nil.
#
# Raises InvalidExtractorException for a missing name or unsupported
# extractor type.
def extractor(name, extractor = nil, &block)
  raise InvalidExtractorException, "First argument (extractor name) is required" unless name
  reader = case extractor
  when NilClass
    # Absent extractor: the block if provided, otherwise the default extractor
    block || default_extractor
  when String
    # Extractor expression; this class is the context used to resolve
    # function extracts such as text().
    Extractor.new(self, extractor)
  else
    raise InvalidExtractorException, "Invalid extractor type: must be a string, parser class, block or nil"
  end
  # Procs answer to +call+ while Extractor objects answer to +extract+, so
  # dispatch on what the reader actually supports.
  define_singleton_method(name) do |node|
    reader.respond_to?(:call) ? reader.call(node) : reader.extract(node)
  end
end
269
+
270
+ # Returns the default extractor.
271
# Hands back the shared DEFAULT_EXTRACTOR constant, used whenever a rule or
# named extractor is declared without an extractor of its own.
def default_extractor
  DEFAULT_EXTRACTOR
end
274
+
275
+ # Called to parse a node.
276
+ #
277
+ # The node may be an element (REXML::Element) or a document (REXML::Document).
278
+ #
279
+ # For example:
280
+ # class ParseLinks
281
+ # include MicroformatParser
282
+ #
283
+ # rule :links, "a", "@href"
284
+ # rule :ids, "a[@id]", "a@id"
285
+ # end
286
+ #
287
+ # parsed = ParseLinks.parse(doc)
288
+ # puts parsed.links
289
+ # puts parsed.ids
290
# Applies the rules to +node+ and, recursively, to its child elements.
#
# node::    The node to process; only REXML::Element instances (which
#           includes documents) are processed, anything else is ignored.
# context:: Object that receives the extracted values; a new instance of
#           this class is created when omitted.
# rules::   Rules to apply; defaults to the rules defined on this class.
#
# Returns the context object.
def parse(node, context = nil, rules = nil)
  # This method is called on the class (creating a new instance) and then
  # recursively with the same context.
  context ||= new
  rules ||= self.rules
  if rules && node.kind_of?(REXML::Element)
    # A rule that processed this node must not be applied to the nodes below
    # it. The rule list is copied lazily before the first removal so the
    # caller's list is never modified.
    remaining = nil
    rules.each_with_index do |rule, index|
      next unless rule && rule.process(node, context)
      remaining ||= rules.dup
      remaining[index] = nil
    end
    rules = remaining if remaining
    # Always hand down the current (possibly reduced) rule list. Passing nil
    # here would make the child fall back to the full rule set and re-apply
    # rules already consumed by an ancestor.
    node.elements.each { |child| parse(child, context, rules) }
  end
  context
end
316
+
317
+ # Returns all the rules for this class.
318
+ #
319
+ # Returns an array of rules defined with +rule+.
320
+ #
321
+ # You can use this method to inspect rules, add/remove rules, etc. Rules are
322
+ # processed in the order in which they are added.
323
# Returns the rule list for this class, creating an empty list on first
# access. The same array is returned on every call, so callers may inspect
# it or add/remove rules in place.
def rules
  @microparser_rules ||= []
end
328
+
329
+ # Returns the text value of a node.
330
# Concatenates the values of all REXML::Text nodes below +node+, in
# document order, descending into child elements. Other child node types
# (comments, for example) contribute nothing. Returns a String, empty when
# there is no text.
def text(node)
  node.children.each_with_object(''.dup) do |child, buffer|
    if child.instance_of?(REXML::Text)
      buffer << child.value
    elsif child.instance_of?(REXML::Element)
      buffer << text(child)
    end
  end
end

# Returns the XML value of a node (the node itself).
def xml(node)
  node
end
346
+
347
+ module_function :text, :xml
348
+
349
+
350
+
351
+ private
352
+
353
+ # Hook called with the class or module that includes this module; it
+ # extends that object with this module, so rule, selector, extractor,
+ # parse and the other methods defined here become callable on it directly.
+ def self.included(mod)
354
+ mod.extend(self)
355
+ end
356
+
357
+
358
+ # Implements a rule.
359
+ #
360
+ # A rule identifies matching nodes using a selector, and a means to extract their value
361
+ # using an extractor. The rule also identifies an instance variable and attribute accessor
362
+ # to retrieve the extracted value, and the cardinality of that value.
363
+ #
364
+ # For more information see MicroformatParser.rule.
365
class Rule

  # The instance variable name (including the leading @) that receives the
  # extracted value on the context object.
  attr_reader :name
  # The rule cardinality (or value limit)
  # 0::  No value (disabled)
  # 1::  First value extracted
  # n::  Up to n values (array)
  # -1:: Unbound (array)
  attr_accessor :limit
  # The rule selector
  attr_reader :selector
  # The rule extractor
  attr_reader :extractor

  # name::      Rule name; stored as the matching instance variable name.
  # selector::  UnboundMethod, Selector, or any object responding to +call+;
  #             nil when the rule wraps a block.
  # extractor:: UnboundMethod, Extractor, Proc/Method, or a Class with +parse+.
  # limit::     Cardinality, see +limit+.
  def initialize(name, selector, extractor, limit)
    # Change the rule name to the attribute name holding the result
    @name = "@#{name}".to_sym
    @selector = selector
    @extractor = extractor
    @limit = limit
  end

  # Called to process this rule on a node with a context object.
  #
  # Returns true if the rule was processed and should be reduced (not
  # applied to any child nodes). Otherwise, returns false.
  def process(node, context)
    # Disabled rule (limit = 0): nothing to do, reduce it.
    return true if @limit == 0
    # Singular rule (limit = 1) whose value is already set: reduce it.
    current = context.instance_variable_get(@name)
    return true if @limit == 1 && current
    # Selector may be nil if the rule was created to wrap a proc.
    return false if @selector && !selected?(node, context)
    value = extract_value(node, context)
    return false unless value
    if @limit == 1
      # Singular: store the value itself.
      context.instance_variable_set(@name, value)
    elsif !current
      # First value of a multi-valued rule: start the array.
      context.instance_variable_set(@name, [value])
    elsif current.instance_of?(Array) && (@limit < 0 || current.size < @limit)
      # Unbounded, or limit not yet reached: append.
      current << value
    end
    # Always reduce after a successful extraction; there is no point in
    # applying the rule to any child nodes.
    true
  end

  def inspect
    if @selector
      "[to #{@name} from #{@selector.inspect}, #{@extractor.inspect}, limit #{@limit}]"
    else
      "[to #{@name} from #{@extractor.inspect}, limit #{@limit}]"
    end
  end

  private

  # Runs the selector against the node, using the calling convention that
  # fits the selector's type.
  def selected?(node, context)
    case @selector
    when UnboundMethod then @selector.bind(context).call(node)
    when Selector then @selector.match(node)
    else @selector.call(node)
    end
  end

  # Runs the extractor against the node; returns nil for an extractor of an
  # unsupported type.
  def extract_value(node, context)
    case @extractor
    when UnboundMethod then @extractor.bind(context).call(node)
    when Extractor then @extractor.extract(node)
    when Proc, Method then @extractor.call(node)
    when Class then @extractor.parse(node)
    end
  end

end
437
+
438
+
439
+ # Implements a selector using a CSS-style expression.
440
+ #
441
+ # For more information see MicroformatParser.selector.
442
class Selector

  # :stopdoc:

  # Parse each selector into five parts:
  # $1 element name or * (optional)
  # $2 ID name (including leading #, optional)
  # $3 class names (including leading ., zero or more)
  # $4 attribute expressions (zero or more)
  # $5 anything else (no leading spaces)
  # An attribute value stops at the first closing bracket so that a later
  # selector in the same statement is left for $5.
  REGEX = /^(\*|[A-Za-z][A-Za-z0-9_\-:]*)?(#[A-Za-z][A-Za-z0-9_\-:]*)?((?:\.[A-Za-z][A-Za-z0-9_\-:]*){0,})((?:\[[A-Za-z][A-Za-z0-9_\-:]*(?:(?:~|\|)?=[^\]]*)?\]){0,})\s*(.*)$/

  # Parse each attribute expression into three parts:
  # $1 attribute name
  # $2 matching operation
  # $3 matched value
  # Matching operation may be =, ~= or |=. Value may be empty.
  ATTR_REGEX = /^([A-Za-z][A-Za-z0-9_\-:]*)((?:~|\|)?=)?(.*)$/

  # :startdoc:

  # tag_name:: Element name to match, or nil for any element.
  # attrs::    Array of [attribute name, Regexp] pairs that must all match.
  # alt::      Alternative selector tried when this one does not match.
  # depends::  Block given the node when this selector matches; its result
  #            becomes the match result (used for parent/sibling checks).
  def initialize(tag_name, attrs, alt = nil, &depends)
    @tag_name = tag_name
    @attrs = attrs
    @alt = alt
    @depends = depends
  end

  # Creates a selector from a CSS-style statement. Compound statements
  # (alternative ",", sibling "+", child ">", descendant " ") are built
  # recursively, which is why this factory exists alongside +new+.
  #
  # The statement is not modified. Raises InvalidSelectorException for an
  # empty statement.
  def Selector.create(statement, alt = nil, &depends)
    statement = statement.strip
    # Parse the first selector expression into $1-$4, anything else goes in $5
    parts = REGEX.match(statement)
    raise InvalidSelectorException, "Invalid (empty) selector statement" if parts[0].empty?

    # Set tag_name to the element name if specified and not *
    tag_name = parts[1] if parts[1] && !parts[1].empty? && parts[1] != '*'
    # Regular expressions for matching attributes. An array is used since
    # several expressions may apply to the same attribute, e.g. to find an
    # element with both class 'foo' and class 'bar'.
    attrs = []
    # Match the ID attribute if specified (dropping the leading #)
    attrs << ['id', Regexp.new('^' + parts[2][1..-1] + '$')] if parts[2]
    # The class attribute is a set of space-separated names, so each class
    # name is matched as a whole word.
    parts[3].split('.').each do |cls|
      attrs << ['class', Regexp.new('\b' + cls + '\b')] unless cls.empty?
    end
    # Each attribute expression is enclosed in square brackets; splitting on
    # the brackets may yield empty strings, which are skipped.
    # NOTE(review): values are used as regular expression source unescaped.
    parts[4].split(/\[|\]/).each do |expr|
      next if expr.empty?
      name, type, value = ATTR_REGEX.match(expr)[1..3]
      pattern = case type
      when '=' then Regexp.new('^' + value + '$')    # whole value
      when '~=' then Regexp.new('\b' + value + '\b') # word within the value
      when '|=' then Regexp.new('^' + value)         # beginning of the value
      else Regexp.new('')                            # existence check
      end
      attrs << [name, pattern]
    end
    # If there's nothing else in the statement, return this selector.
    selector = Selector.new(tag_name, attrs, alt, &depends)
    remainder = parts[5]
    return selector if remainder.empty?

    case remainder[0, 1]
    when ','
      # Alternative selector: second statement is alternative to the first one
      Selector.create(remainder[1..-1], selector)
    when '+'
      # Sibling selector: the second statement must match the node and the
      # first statement its previous sibling element.
      Selector.create(remainder[1..-1]) do |node|
        node.previous_element && selector.match(node.previous_element)
      end
    when '>'
      # Child selector: the second statement must match the node and the
      # first statement its parent.
      Selector.create(remainder[1..-1]) do |node|
        node.parent && selector.match(node.parent)
      end
    else
      # Descendant selector: the second statement must match the node and
      # the first statement any of its ancestors.
      Selector.create(remainder) do |node|
        ancestor = node.parent
        found = false
        while ancestor
          break if (found = selector.match(ancestor))
          ancestor = ancestor.parent
        end
        found
      end
    end
  end

  # Creates a new selector for the given class name.
  def Selector.for_class(cls)
    Selector.new(nil, [["class", Regexp.new('\b' + cls + '\b')]])
  end

  # Tests whether the node matches this selector.
  def match(node)
    # Match if there is no element name or it equals the node name, and
    # every attribute expression matches.
    matched = (!@tag_name || @tag_name == node.name)
    matched &&= @attrs.all? { |attr_name, pattern| pattern =~ node.attributes[attr_name] }
    # If the node did not match, but we have an alternative match
    # (x,y), apply the alternative match instead
    return @alt.match(node) if !matched && @alt
    # If the node did match, but depends on another match (parent,
    # sibling, etc), apply the dependent match as well
    return @depends.call(node) if matched && @depends
    matched
  end

  # Rebuilds an approximate selector statement from the element name and
  # the attribute patterns.
  def inspect
    stmt = (@tag_name || '').dup
    @attrs.each do |attr_name, pattern|
      stmt << "[#{attr_name}"
      case pattern.source
      when /^\\b(.*)\\b$/ then stmt << "~=#{$1}"
      when /^\^(.*)\$$/ then stmt << "=#{$1}"
      when /^\^([^$]*)$/ then stmt << "|=#{$1}"
      end
      stmt << ']'
    end
    stmt << ',' << @alt.inspect if @alt
    stmt
  end

end
594
+
595
+
596
+ # Implements an extractor using a simple expression format.
597
+ #
598
+ # For more information see MicroformatParser.extractor.
599
class Extractor

  # :stopdoc:

  # Parse each extract into three parts:
  # $1 function name (excluding parentheses)
  # $2 element name
  # $3 attribute name (including leading @)
  # If a match is found the result is either $1, or $2 and/or $3.
  # The whole extract must match; trailing or leading junk is rejected.
  REGEX = /\A(?:(\w+)\(\)|([A-Za-z][A-Za-z0-9_\-:]*)?(@[A-Za-z][A-Za-z0-9_\-:]*)?)\z/

  # :startdoc:

  # context::   Object (normally the rule class) whose methods are used for
  #             function extracts such as text(), and whose +text+ method
  #             supplies the text of element-only extracts.
  # statement:: Extractor expression: one or more extracts separated by |.
  #
  # The statement is not modified. Raises InvalidExtractorException when an
  # extract is malformed, names an unknown method, or the statement is empty.
  def initialize(context, statement)
    @statement = statement.strip
    # Break the statement into multiple extraction rules, separated by |.
    @extracts = @statement.split('|').map { |expression| compile(context, expression.strip) }
    raise InvalidExtractorException, "Invalid (empty) extraction statement" if @extracts.empty?
  end

  public

  # Extracts a value from the node: the first non-nil/false result among
  # the extraction rules, tried in order; nil when none yields a value.
  def extract(node)
    value = nil
    @extracts.each do |extract|
      value = extract.call(node)
      break if value
    end
    value
  end

  # Returns the extractor expression this extractor was built from.
  def inspect
    @statement
  end

  private

  # Turns one extract expression into a callable taking a node.
  def compile(context, expression)
    parts = REGEX.match(expression)
    raise InvalidExtractorException, "Invalid extraction statement" unless parts
    function, element, attribute = parts[1], parts[2], parts[3]
    attr_name = attribute[1..-1] if attribute
    if function
      # Function: a method of the context object.
      lookup(context, function)
    elsif element && attribute
      # Apply only if element of this type, and extract the named attribute.
      proc { |node| node.attributes[attr_name] if node.name == element }
    elsif element
      # Apply only if element of this type, and extract the text value. The
      # text method lives on the context, not on this class.
      text_of = lookup(context, 'text')
      proc { |node| text_of.call(node) if node.name == element }
    elsif attribute
      # Extract the named attribute.
      proc { |node| node.attributes[attr_name] }
    else
      raise InvalidExtractorException, "Invalid extraction statement"
    end
  end

  # Finds a method on the context object, reporting an unknown name as an
  # invalid extractor.
  def lookup(context, method_name)
    context.method(method_name)
  rescue NameError => error
    raise InvalidExtractorException, error.message, error.backtrace
  end

end
661
+
662
+ DEFAULT_EXTRACTOR = Extractor.new(self, "abbr@title|a@href|text()")
663
+
664
# Base class for InvalidSelectorException and InvalidExtractorException.
# Also raised when a rule is defined with invalid arguments.
#
# Derives from StandardError so that a plain +rescue+ catches it.
class InvalidRuleException < StandardError
end

# Raised to indicate an invalid selector statement.
class InvalidSelectorException < InvalidRuleException
end

# Raised to indicate an invalid extractor statement.
class InvalidExtractorException < InvalidRuleException
end
676
+
677
+
678
+ end
679
+
680
+
681
+ # A parser for several microformats.
682
+ #
683
+ # Defines rules for the following attributes:
684
+ # tags:: A list of tags based on relTag (array of String)
685
+ # events:: A list of events based on hCalendar (array of HCalendar)
686
+ #
687
+ # For example:
688
+ # content = Microformats.parse(doc)
689
+ # puts "Tagged with " + content.tags.join(", ") if content.tags
690
# Time.parse (used by the hCalendar date extractor below) is defined by the
# 'time' standard library, not by core Ruby.
require 'time'

class Microformats

  include MicroformatParser

  # Parses the fields of an hCalendar element.
  #
  # Defines rules for the following attributes:
  # dtstart::     The event's start date/time (a Time object)
  # dtend::       The event's end date/time (a Time object)
  # summary::     The event's summary (text value)
  # description:: The event's description (XML node)
  # url::         The event's URL (string)
  # location::    The event's location (XML node)
  # contact::     The event's contact (XML node)
  #
  # For example:
  #   content = Microformats.parse(doc)
  #   content.events.each do |event|
  #     puts "Event on #{event.dtstart}"
  #   end
  class HCalendar

    include MicroformatParser

    # Extract ISO date/time: the title attribute of an abbr element,
    # otherwise the element's text. Blank input yields nil rather than an
    # ArgumentError from Time.parse.
    extractor :dt_extractor do |node|
      value = node.attributes['title'] if node.name == 'abbr'
      value = text(node) unless value
      value = nil if value && value.strip.empty?
      value ? Time.parse(value) : nil
    end

    rule_1 :dtstart, nil, :dt_extractor
    rule_1 :dtend, nil, :dt_extractor
    rule_1 :summary, nil, :text
    rule_1 :description, nil, :xml
    rule_1 :url, nil, "a@href"
    rule_1 :location, nil, :xml
    rule_1 :contact, nil, :xml

  end

  rule :tags, "a[rel~=tag]", "text()"
  rule :events, ".vevent", HCalendar

end
metadata ADDED
@@ -0,0 +1,50 @@
1
+ --- !ruby/object:Gem::Specification
2
+ rubygems_version: 0.8.3
3
+ specification_version: 1
4
+ name: uformatparser
5
+ version: !ruby/object:Gem::Version
6
+ version: 1.0.0
7
+ date: 2005-11-20
8
+ summary: Microformat parser for extracting microcontent from (X)HTML
9
+ require_paths:
10
+ - lib
11
+ email: assaf@labnotes.org
12
+ homepage: http://trac.labnotes.org/cgi-bin/trac.cgi/wiki/Ruby/MicroformatParser
13
+ rubyforge_project: uformatparser
14
+ description: "Parser for extracting microcontent from (X)HTML documents, in any number of
15
+ microformats. Uses a DSL for specifying the parsing rules as a set of selectors
16
+ and extractors. Supports a CSS-like selector and XPath-like extractor syntaxes
17
+ for quick and easy rule writing. Also supports reusable and compound rules,"
18
+ autorequire: uformatparser.rb
19
+ default_executable:
20
+ bindir: bin
21
+ has_rdoc: true
22
+ required_ruby_version: !ruby/object:Gem::Version::Requirement
23
+ requirements:
24
+ -
25
+ - ">"
26
+ - !ruby/object:Gem::Version
27
+ version: 0.0.0
28
+ version:
29
+ platform: ruby
30
+ authors:
31
+ - Assaf Arkin
32
+ files:
33
+ - lib/uformatparser.rb
34
+ - README
35
+ - Rakefile
36
+ - MIT-LICENSE
37
+ test_files: []
38
+ rdoc_options:
39
+ - "--main"
40
+ - README
41
+ - "--title"
42
+ - Microformat parser
43
+ - "--line-numbers"
44
+ extra_rdoc_files:
45
+ - README
46
+ executables: []
47
+ extensions: []
48
+ requirements:
49
+ - ReXML. HTML->ReXML parser.
50
+ dependencies: []