uformatparser 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/MIT-LICENSE +20 -0
- data/README +44 -0
- data/Rakefile +74 -0
- data/lib/uformatparser.rb +731 -0
- metadata +50 -0
data/MIT-LICENSE
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2005 Assaf Arkin
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README
ADDED
@@ -0,0 +1,44 @@
|
|
1
|
+
= Microformat Parser
|
2
|
+
|
3
|
+
MicroformatParser is a Ruby module for creating microformat parsers.
|
4
|
+
A microformat parser is a class with a set of rules for extracting
|
5
|
+
interesting content from (X)HTML documents. You create your own parser
|
6
|
+
by writing a class with a set of rules. The magic happens in the parse
|
7
|
+
method which takes an (X)HTML document or element, runs all the rules
|
8
|
+
on it, and returns a new object that holds the extracted values.
|
9
|
+
|
10
|
+
Here's a simple example to find all links and all tags in a document:
|
11
|
+
|
12
|
+
class MyParser
|
13
|
+
include MicroformatParser
|
14
|
+
|
15
|
+
rule :links, "a", "a@href"
|
16
|
+
rule :tags, "a[rel~=tag]", "text()"
|
17
|
+
end
|
18
|
+
|
19
|
+
content = MyParser.parse(doc)
|
20
|
+
puts "Found #{content.links.size} links" if content.links
|
21
|
+
puts "Tagged with " + content.tags.join(', ') if content.tags
|
22
|
+
|
23
|
+
|
24
|
+
== Documentation
|
25
|
+
|
26
|
+
You may want to read the documentation for a more detailed discussion of
|
27
|
+
selectors, extractors, compound rules, (X)HTML parsing and examples
|
28
|
+
|
29
|
+
http://trac.labnotes.org/cgi-bin/trac.cgi/wiki/Ruby/MicroformatParser
|
30
|
+
|
31
|
+
|
32
|
+
== Download
|
33
|
+
|
34
|
+
The latest version can be found at
|
35
|
+
|
36
|
+
http://rubyforge.org/projects/uformatparser/
|
37
|
+
|
38
|
+
== License
|
39
|
+
|
40
|
+
This package is licensed under the MIT license and/or the {Creative
|
41
|
+
Commons Attribution-ShareAlike}[http://creativecommons.org/licenses/by-sa/2.5/legalcode].
|
42
|
+
|
43
|
+
:include: MIT-LICENSE
|
44
|
+
|
data/Rakefile
ADDED
@@ -0,0 +1,74 @@
|
|
1
|
+
# Adapted from the rake Rakefile.
|
2
|
+
|
3
|
+
require 'rubygems'
|
4
|
+
Gem::manage_gems
|
5
|
+
require 'rake/testtask'
|
6
|
+
require 'rake/rdoctask'
|
7
|
+
require 'rake/gempackagetask'
|
8
|
+
|
9
|
+
|
10
|
+
desc "Default Task"
|
11
|
+
task :default => [:tests, :rdoc]
|
12
|
+
|
13
|
+
|
14
|
+
Rake::TestTask.new :tests do |test|
|
15
|
+
test.verbose = true
|
16
|
+
test.test_files = ['test/*.rb']
|
17
|
+
end
|
18
|
+
|
19
|
+
|
20
|
+
# Create the documentation.
|
21
|
+
Rake::RDocTask.new do |rdoc|
|
22
|
+
rdoc.main = "README"
|
23
|
+
rdoc.rdoc_files.include("README", "lib/**/*.rb")
|
24
|
+
rdoc.title = 'Microformat Parser'
|
25
|
+
end
|
26
|
+
|
27
|
+
|
28
|
+
# Create the GEM package.
|
29
|
+
gem_spec = Gem::Specification.new do |spec|
|
30
|
+
spec.name = 'uformatparser'
|
31
|
+
spec.version = "1.0.0"
|
32
|
+
spec.summary = "Microformat parser for extracting microcontent from (X)HTML"
|
33
|
+
spec.description = <<-EOF
|
34
|
+
Parser for extracting microcontent from (X)HTML documents, in any number
|
35
|
+
of microformats.
|
36
|
+
|
37
|
+
Uses a DSL for specifying the parsing rules as a set of selectors and
|
38
|
+
extractors. Supports a CSS-like selector and XPath-like extractor syntaxes
|
39
|
+
for quick and easy rule writing. Also supports reusable and compound rules,
|
40
|
+
EOF
|
41
|
+
spec.author = "Assaf Arkin"
|
42
|
+
spec.email = "assaf@labnotes.org"
|
43
|
+
spec.homepage = "http://trac.labnotes.org/cgi-bin/trac.cgi/wiki/Ruby/MicroformatParser"
|
44
|
+
|
45
|
+
spec.files = FileList["{test,lib}/**/*", "README", "Rakefile", "MIT-LICENSE"].to_a
|
46
|
+
spec.require_path = "lib"
|
47
|
+
spec.autorequire = 'uformatparser.rb'
|
48
|
+
spec.requirements << "ReXML. HTML->ReXML parser."
|
49
|
+
spec.has_rdoc = true
|
50
|
+
spec.rdoc_options << '--main' << 'README' << '--title' << 'Microformat parser' << '--line-numbers'
|
51
|
+
spec.extra_rdoc_files = ["README"]
|
52
|
+
spec.rubyforge_project = "uformatparser"
|
53
|
+
end
|
54
|
+
|
55
|
+
gem = Rake::GemPackageTask.new(gem_spec) do |pkg|
|
56
|
+
pkg.need_tar = true
|
57
|
+
pkg.need_zip = true
|
58
|
+
end
|
59
|
+
|
60
|
+
|
61
|
+
# --------------------------------------------------------------------
|
62
|
+
# Creating a release
|
63
|
+
|
64
|
+
desc "Make a new release"
|
65
|
+
task :release => [:tests, :clobber, :package] do
|
66
|
+
puts
|
67
|
+
puts "**************************************************************"
|
68
|
+
puts "* Release #{gem_spec.version} Complete."
|
69
|
+
puts "* Packages ready to upload."
|
70
|
+
puts "**************************************************************"
|
71
|
+
puts
|
72
|
+
end
|
73
|
+
|
74
|
+
|
@@ -0,0 +1,731 @@
|
|
1
|
+
#
|
2
|
+
# = uformatparser.rb - Microformat parser
|
3
|
+
#
|
4
|
+
#--
|
5
|
+
# Author:: Assaf Arkin assaf@labnotes.org
|
6
|
+
# Documentation:: http://trac.labnotes.org/cgi-bin/trac.cgi/wiki/Ruby/MicroformatParser
|
7
|
+
# Copyright:: Copyright (c) 2005 Assaf Arkin
|
8
|
+
# License:: Creative Commons Attribution-ShareAlike
|
9
|
+
#
|
10
|
+
#++
|
11
|
+
|
12
|
+
require 'rexml/document'
|
13
|
+
|
14
|
+
|
15
|
+
# Implements a microformat parser by extending a class that includes this module.
|
16
|
+
#
|
17
|
+
# === The Basics
|
18
|
+
#
|
19
|
+
# To create a microformat parser, extend a class with this module and use the
|
20
|
+
# +rule+ method to define parsing rules for that class. Call +parse+ to parse the
|
21
|
+
# content, returning a new instance of the class holding all values extracted from
|
22
|
+
# parsing. You can parse a document or an element.
|
23
|
+
#
|
24
|
+
# For example:
|
25
|
+
# class Microformats
|
26
|
+
# include MicroformatParser
|
27
|
+
#
|
28
|
+
# class HCalendar
|
29
|
+
# include MicroformatParser
|
30
|
+
#
|
31
|
+
# # Extract ISO date/time
|
32
|
+
# extractor :dt_extractor do |node|
|
33
|
+
# value = node.attributes['title'] if node.name == 'abbr'
|
34
|
+
# value = text(node) unless value
|
35
|
+
# value ? Time.parse(value) : nil
|
36
|
+
# end
|
37
|
+
#
|
38
|
+
# rule_1 :dtstart, nil, :dt_extractor
|
39
|
+
# rule_1 :dtend, nil, :dt_extractor
|
40
|
+
# rule_1 :summary, nil, :text
|
41
|
+
# rule_1 :description, nil, :xml
|
42
|
+
# rule_1 :url, nil, "a@href"
|
43
|
+
# end
|
44
|
+
#
|
45
|
+
# rule :tags, "a[rel~=tag]", "text()"
|
46
|
+
# rule :events, ".vevent", HCalendar
|
47
|
+
# end
|
48
|
+
#
|
49
|
+
# content = Microformats.parse(doc)
|
50
|
+
# puts content.tags
|
51
|
+
# puts content.events
|
52
|
+
#
|
53
|
+
module MicroformatParser
|
54
|
+
|
55
|
+
|
56
|
+
# Create a new rule.
|
57
|
+
#
|
58
|
+
# There are two ways to define a rule:
|
59
|
+
# * rule name, selector?, extractor?, limit?
|
60
|
+
# * rule name, limit? { block }
|
61
|
+
#
|
62
|
+
# The +name+ argument specifies an instance variable that holds the value
|
63
|
+
# (or values) extracted from processing this rule. It can be a string or
|
64
|
+
# a symbol. An attribute accessor is created with that name.
|
65
|
+
#
|
66
|
+
# The +selector+ argument identifies all nodes that match the rule. It can
|
67
|
+
# be a CSS-style selector (string) or a method/proc. A symbol specifies
|
68
|
+
# a method to use from this class. The method/proc receives a single argument
|
69
|
+
# with the node and must return true/false.
|
70
|
+
#
|
71
|
+
# If selector is absent, the default selector will match any element with
|
72
|
+
# a class of the same name as the name argument. For example:
|
73
|
+
# rule :dtstart
|
74
|
+
# Matches all elements with the class _dtstart_.
|
75
|
+
#
|
76
|
+
# The +extractor+ argument specifies how to extract a value from a selected
|
77
|
+
# node. It can be a list of extract rules (string), a method/proc, or a class.
|
78
|
+
# A symbol specifies a method to use from this class. The method/proc receives
|
79
|
+
# a single argument with the node and returns the extracted value, or nil.
|
80
|
+
#
|
81
|
+
# If the extractor is a class, it references a microformat parser which is
|
82
|
+
# then called to parse the content of a matching element.
|
83
|
+
#
|
84
|
+
# If extractor is absent, the default extractor is used:
|
85
|
+
# abbr@title|a@href|text()
|
86
|
+
#
|
87
|
+
# The +limit+ argument specifies the cardinality of the rule's value:
|
88
|
+
# 0 The rule is never applied
|
89
|
+
# 1 The rule is applied once, the first extracted value is set
|
90
|
+
# -1 The rule is applied multiple times, extracted values are set in an array
|
91
|
+
# n The rule is applied up to _n_ times, extracted values are set in an array
|
92
|
+
#
|
93
|
+
# In the second form, a block is specified instead of the selector/extractor.
|
94
|
+
# The block is called with a node and returns the extracted value, or nil.
|
95
|
+
# Defines a parsing rule on this class and creates an attribute accessor
# with the same name to hold the extracted value(s).
#
# name::      rule name (string or symbol), also the accessor name; required.
# selector::  nil (match elements whose class attribute contains +name+),
#             a CSS-style statement (string), a proc/method, or a symbol
#             naming a method of this class.
# extractor:: nil (default extractor), an extractor expression (string),
#             a proc/method, a symbol naming a method of this class, or a
#             class that implements +parse+.
# limit::     0 disables the rule, 1 keeps the first value, -1 collects all
#             values in an array, n collects up to n values in an array.
# proc::      optional block used in place of the selector/extractor pair.
#
# Raises InvalidRuleException, InvalidSelectorException or
# InvalidExtractorException when an argument is invalid.
# Returns the ruleset of this class with the new rule appended.
def rule(name, selector = nil, extractor = nil, limit = -1, &proc)
  raise InvalidRuleException, "First argument (rule name) is required" unless name
  if proc
    # The rule processing is taken from the block, everything else must be nil
    if selector || extractor
      raise InvalidRuleException, "Can't specify selector/extractor in combination with proc"
    end
    new_rule = Rule.new(name, nil, proc, limit)
  else
    # Determine the selector.
    selector = case selector
      when NilClass
        # Absent selector: match elements with the same class as the rule name.
        # Proc.new is spelled out because the +proc+ parameter shadows Kernel#proc.
        match = Regexp.new("\\b#{Regexp.escape(name.to_s)}\\b")
        Proc.new { |node| node.attributes['class'] =~ match }
      when String
        # CSS-style selector
        Selector.create(selector)
      when Proc, Method
        # Use as is
        selector
      when Symbol
        # Look up the named method on this class. A missing method raises
        # NameError, which is reported as an invalid selector naming the method.
        begin
          method(selector)
        rescue NameError => error
          raise InvalidSelectorException, "Method #{selector} is not a valid selector", error.backtrace
        end
      else
        raise InvalidSelectorException, "Invalid selector type: must be a string, symbol, proc/method or nil"
    end

    # Determine the extractor
    extractor = case extractor
      when NilClass
        # Absent extractor: use the default extractor
        default_extractor
      when String
        # Extractor expression
        Extractor.new(self, extractor)
      when Proc, Method
        # Use as is
        extractor
      when Symbol
        # Look up the named method on this class (see the selector case above).
        begin
          method(extractor)
        rescue NameError => error
          raise InvalidExtractorException, "Method #{extractor} is not a valid extractor", error.backtrace
        end
      when Class
        # Extractor is a class, generally another ruleset, so its parse
        # method will be called (must exist).
        begin
          extractor.method(:parse)
        rescue NameError => error
          raise InvalidExtractorException, "Extractor class must implement the method parse", error.backtrace
        end
        extractor
      else
        raise InvalidExtractorException, "Invalid extractor type: must be a string, parser class, block or nil"
    end

    new_rule = Rule.new(name, selector, extractor, limit)
  end

  # Create a reader/writer pair named after the rule to hold its value.
  attr_accessor name
  # Add this rule to the class's ruleset
  self.rules << new_rule
end
|
165
|
+
|
166
|
+
|
167
|
+
# Create a new rule that extracts at most one value.
|
168
|
+
#
|
169
|
+
# Same as calling +rule+ with +limit+=1
|
170
|
+
def rule_1(name, selector = nil, extractor = nil, &proc)
|
171
|
+
# Rule with limit of one value
|
172
|
+
rule(name, selector, extractor, 1, &proc)
|
173
|
+
end
|
174
|
+
|
175
|
+
|
176
|
+
# Creates a new selector.
|
177
|
+
#
|
178
|
+
# There are two ways to create a selector:
|
179
|
+
# * selector name, statement
|
180
|
+
# * selector name { block }
|
181
|
+
#
|
182
|
+
# The +name+ argument (a string or symbol) specifies the selector name,
|
183
|
+
# defining a class method with that name that can be used to identify matching
|
184
|
+
# element.
|
185
|
+
#
|
186
|
+
# The selector can be a CSS-style selector (string) or a block that accepts a
|
187
|
+
# single argument (element) and returns true or false.
|
188
|
+
#
|
189
|
+
# For example:
|
190
|
+
# selector(:select_link) { |node| node.name == 'a' }
|
191
|
+
# extractor(:extract_link) { |node| node.attributes['href'] }
|
192
|
+
# rule :links, :select_link, :extract_link
|
193
|
+
# Defines a named selector as a class method taking a node and returning
# whether the node matches.
#
# name::     selector name (string or symbol); required.
# selector:: a CSS-style statement (string), or nil.
# proc::     block used when +selector+ is nil; receives the node.
#
# When neither a statement nor a block is given, the selector matches
# elements whose class attribute contains +name+.
#
# Raises InvalidSelectorException when the name is missing or the selector
# is neither a string nor nil.
def selector(name, selector = nil, &proc)
  raise InvalidSelectorException, "First argument (selector name) is required" unless name
  matcher = case selector
    when NilClass
      # Absent selector: either block is provided, or we create a selector
      # that matches element with the same class as the selector name
      if proc
        proc
      else
        # Proc.new is spelled out because the +proc+ parameter shadows Kernel#proc.
        pattern = Regexp.new("\\b#{Regexp.escape(name.to_s)}\\b")
        Proc.new { |node| node.attributes['class'] =~ pattern }
      end
    when String
      # CSS-style selector. Selector objects respond to +match+, not +call+,
      # so hand back the bound method to give every branch a callable.
      Selector.create(selector).method(:match)
    else
      raise InvalidSelectorException, "Invalid selector type: must be a string, block or nil"
  end
  # Create a class method using the selector name that invokes the matcher.
  class << self
    self
  end.instance_eval { define_method(name) { |node| matcher.call(node) } }
end
|
217
|
+
|
218
|
+
|
219
|
+
# Creates a new extractor.
|
220
|
+
#
|
221
|
+
# There are two ways to create an extractor:
|
222
|
+
# * extractor name, statement
|
223
|
+
# * extractor name { block }
|
224
|
+
#
|
225
|
+
# The +name+ argument (string or symbol) specifies the extractor name,
|
226
|
+
# defining a class method with that name that can be used to extract the
|
227
|
+
# value of a node.
|
228
|
+
#
|
229
|
+
# The extractor can be an expression (string) or a block that accepts a
|
230
|
+
# single argument (element) and returns the extracted value, or nil.
|
231
|
+
#
|
232
|
+
# For example:
|
233
|
+
# selector(:select_link) { |node| node.name == 'a' }
|
234
|
+
# extractor(:extract_link) { |node| node.attributes['href'] }
|
235
|
+
# rule :links, :select_link, :extract_link
|
236
|
+
#
|
237
|
+
# The expression takes the form of:
|
238
|
+
# extractor := extract (|extract)*
|
239
|
+
# extract := element | @attribute | element@attribute | method()
|
240
|
+
#
|
241
|
+
# If multiple extracts are specified, the first extracted value is used.
|
242
|
+
#
|
243
|
+
# If an element is specified, the text value is extracted only if the selected
|
244
|
+
# node is an element of that type. If an attribute is specified, the extracted
|
245
|
+
# value is the attribute's value. If both element and attribute are used, the
|
246
|
+
# attribute value is extracted only if the selected node is an element of that
|
247
|
+
# type.
|
248
|
+
#
|
249
|
+
# If a method is specified, that method is called for the node. There are two
|
250
|
+
# methods available in any class: +text+ and +xml+.
|
251
|
+
# Defines a named extractor as a class method taking a node and returning
# the extracted value, or nil.
#
# name::      extractor name (string or symbol); required.
# extractor:: an extractor expression (string), or nil.
# proc::      block used when +extractor+ is nil; receives the node.
#
# When neither an expression nor a block is given, the default extractor
# is used.
#
# Raises InvalidExtractorException when the name is missing or the
# extractor is neither a string nor nil.
def extractor(name, extractor = nil, &proc)
  raise InvalidExtractorException, "First argument (extractor name) is required" unless name
  callable = case extractor
    when NilClass
      # Absent extractor: either block if provided, otherwise default extractor
      proc ? proc : default_extractor
    when String
      # Extractor expression; the extractor resolves function names such as
      # text() against this class, so it must be given as the context.
      Extractor.new(self, extractor)
    else
      raise InvalidExtractorException, "Invalid extractor type: must be a string, parser class, block or nil"
  end
  # Extractor objects respond to +extract+, not +call+; use the bound method
  # so the generated class method can treat every branch alike.
  callable = callable.method(:extract) if callable.kind_of?(Extractor)
  # Create a class method using the extractor name that invokes the callable.
  class << self
    self
  end.instance_eval { define_method(name) { |node| callable.call(node) } }
end
|
269
|
+
|
270
|
+
# Returns the default extractor.
|
271
|
+
def default_extractor()
|
272
|
+
return DEFAULT_EXTRACTOR
|
273
|
+
end
|
274
|
+
|
275
|
+
# Called to parse a node.
|
276
|
+
#
|
277
|
+
# The node may be an element (REXML::Element) or a document (REXML::Document).
|
278
|
+
#
|
279
|
+
# For example:
|
280
|
+
# class ParseLinks
|
281
|
+
# include MicroformatParser
|
282
|
+
#
|
283
|
+
# rule :links, "a", "@href"
|
284
|
+
# rule :ids, "a[id]", "a@id"
|
285
|
+
# end
|
286
|
+
#
|
287
|
+
# parsed = ParseLinks.parse(doc)
|
288
|
+
# puts parsed.links
|
289
|
+
# puts parsed.ids
|
290
|
+
# Parses a node and all its descendant elements, applying the rules.
#
# node::    the element or document to parse; anything that is not a
#           REXML::Element is ignored.
# context:: object receiving the extracted values; a new instance of this
#           class is created when absent.
# rules::   ruleset to apply; defaults to the rules of this class. Used by
#           the recursion to pass the reduced ruleset to child elements.
#
# Returns the context object.
def parse(node, context = nil, rules = nil)
  # Create a new object unless one is provided by the caller (recursion).
  context ||= self.new
  # Obtain the rules for this class unless provided by caller.
  rules ||= self.rules
  if rules && node.kind_of?(REXML::Element)
    # A rule that processed this node is not applied to its descendants, but
    # other rules still are. The caller's ruleset must not be modified, so it
    # is copied the first time a rule has to be removed.
    remaining = nil
    rules.each_with_index do |rule, index|
      next unless rule && rule.process(node, context)
      remaining ||= rules.dup
      remaining[index] = nil
    end
    # When nothing was removed here, keep passing the ruleset we were given;
    # passing nil would make the child fall back to the full ruleset and
    # re-enable rules that an ancestor already reduced.
    remaining ||= rules
    node.elements.each { |child| parse(child, context, remaining) }
  end
  context
end
|
316
|
+
|
317
|
+
# Returns all the rules for this class.
|
318
|
+
#
|
319
|
+
# Returns an array of rules defined with +rule+.
|
320
|
+
#
|
321
|
+
# You can use this method to inspect rules, add/remove rules, etc. Rules are
|
322
|
+
# processed in the order in which they are added.
|
323
|
+
# Returns the array of rules defined for this class, creating an empty
# one on first use. The same array is returned on every call, so callers
# may add or remove rules through it.
def rules
  @microparser_rules ||= []
end
|
328
|
+
|
329
|
+
# Returns the text value of a node.
|
330
|
+
# Returns the text value of a node: the values of its REXML::Text children
# and, recursively, of its REXML::Element children, concatenated in
# document order. Children of any other exact class are skipped.
def text(node)
  # Start from an unfrozen string so it can be appended to in place.
  value = ''.dup
  node.children.each do |child|
    if child.instance_of?(REXML::Text)
      value << child.value
    elsif child.instance_of?(REXML::Element)
      value << text(child)
    end
  end
  value
end
|
341
|
+
|
342
|
+
# Returns the XML value of a node (the node itself).
|
343
|
+
def xml(node)
|
344
|
+
node
|
345
|
+
end
|
346
|
+
|
347
|
+
module_function :text, :xml
|
348
|
+
|
349
|
+
|
350
|
+
|
351
|
+
private
|
352
|
+
|
353
|
+
def self.included(mod)
|
354
|
+
mod.extend(self)
|
355
|
+
end
|
356
|
+
|
357
|
+
|
358
|
+
# Implements a rule.
|
359
|
+
#
|
360
|
+
# A rule identifies matching nodes using a selector, and a means to extract their value
|
361
|
+
# using an extractor. The rule also identifies an instance variable and attribute accessor
|
362
|
+
# to retrieve the extracted value, and the cardinality of that value.
|
363
|
+
#
|
364
|
+
# For more information see MicroformatParser.rule.
|
365
|
+
# A single parsing rule: a selector that identifies matching nodes, an
# extractor that obtains a value from a matched node, the instance variable
# the value is stored in, and the cardinality of that value.
class Rule

  # The instance variable name (with leading @) that holds the value.
  attr_reader :name
  # The rule cardinality (or value limit)
  # 0::  No value (disabled)
  # 1::  First value extracted
  # n::  Up to n values (array)
  # -1:: Unbound (array)
  attr_accessor :limit
  # The rule selector
  attr_reader :selector
  # The rule extractor
  attr_reader :extractor

  def initialize(name, selector, extractor, limit)
    # Change the rule name to the instance variable name holding the result
    @name = "@#{name}".to_sym
    @selector = selector
    @extractor = extractor
    @limit = limit
  end

  # Processes this rule on a node, storing any extracted value in the
  # context object.
  #
  # Returns true if the rule was processed and should be reduced (not
  # applied to any child nodes). Otherwise, returns false.
  def process(node, context)
    # Do nothing if rule is disabled (limit = 0), reduce it.
    return true if @limit == 0
    # Do nothing if rule is singular (limit = 1) and a value was already set
    current = context.instance_variable_get(@name)
    return true if @limit == 1 && current
    # Match the current node, do nothing if not matched
    return false unless selected?(node, context)
    # Extract the value. Do nothing if nothing extracted
    value = extract_value(node, context)
    return false unless value
    if @limit == 1
      # Singular rule: set the value itself
      context.instance_variable_set(@name, value)
    elsif !current
      # First value: start a new array
      context.instance_variable_set(@name, [value])
    elsif current.instance_of?(Array) && (@limit < 0 || current.size < @limit)
      # No limit, or limit not reached: append to the existing array
      current << value
    end
    # Always reduce: there's no point in applying the rule to child nodes.
    true
  end

  def inspect
    @selector ? "[to #{@name} from #{@selector.inspect}, #{@extractor.inspect}, limit #{@limit}]" : "[to #{@name} from #{@extractor.inspect}, limit #{@limit}]"
  end

  private

  # Applies the selector to the node. A rule without a selector (created
  # to wrap a block) selects every node.
  def selected?(node, context)
    case @selector
    when nil then true
    when UnboundMethod then @selector.bind(context).call(node)
    when Selector then @selector.match(node)
    else @selector.call(node)
    end
  end

  # Applies the extractor to the node according to its type. Returns nil
  # for an extractor of any other type.
  def extract_value(node, context)
    case @extractor
    when UnboundMethod then @extractor.bind(context).call(node)
    when Extractor then @extractor.extract(node)
    when Proc, Method then @extractor.call(node)
    when Class then @extractor.parse(node)
    end
  end

end
|
437
|
+
|
438
|
+
|
439
|
+
# Implements a selector using a CSS-style expression.
|
440
|
+
#
|
441
|
+
# For more information see MicroformatParser.selector.
|
442
|
+
# Implements a selector using a CSS-style expression: an optional element
# name, ID, class names and attribute expressions, optionally combined with
# further selectors using "," (alternative), "+" (previous sibling),
# ">" (parent) or whitespace (ancestor).
class Selector

  # :stopdoc:

  # Parse each selector into five parts:
  # $1 element name or * (optional)
  # $2 ID name (including leading #, optional)
  # $3 class names (including leading ., zero or more)
  # $4 attribute expressions (zero or more)
  # $5 anything else (no leading spaces)
  REGEX = /^(\*|[A-Za-z][A-Za-z0-9_\-:]*)?(#[A-Za-z][A-Za-z0-9_\-:]*)?((?:\.[A-Za-z][A-Za-z0-9_\-:]*){0,})((?:\[[A-Za-z][A-Za-z0-9_\-:]*(?:(?:~|\|)?=.*)?\]){0,})\s*(.*)$/

  # Parse each attribute expression into three parts:
  # $1 attribute name
  # $2 matching operation
  # $3 matched value
  # Matching operation may be =, ~= or |=. Value may be empty.
  ATTR_REGEX = /^([A-Za-z][A-Za-z0-9_\-:]*)((?:~|\|)?=)?(.*)$/

  # :startdoc:

  # tag_name:: element name to match, or nil to match any element.
  # attrs::    array of [attribute name, Regexp] pairs that must all match.
  # alt::      alternative selector tried when this one does not match.
  # depends::  block called with the node when this selector matches; its
  #            result becomes the result of the match.
  def initialize(tag_name, attrs, alt = nil, &depends)
    @tag_name = tag_name
    @attrs = attrs
    @alt = alt
    @depends = depends
  end

  # Creates a new selector from a statement. Compound statements are
  # handled by recursing on the remainder of the statement, which is why
  # this factory is used rather than calling new directly.
  #
  # Raises InvalidSelectorException if the statement is empty or contains
  # an expression that cannot be parsed.
  def Selector.create(statement, alt = nil, &depends)
    # Work on a stripped copy: the caller's string may be frozen or reused.
    statement = statement.strip
    # Parse the first selector expression into $1-$4, anything else goes in $5
    parts = REGEX.match(statement)
    raise InvalidSelectorException, "Invalid (empty) selector statement" if parts[0].length == 0

    # Set tag_name to the element name if specified and not *
    tag_name = parts[1] if parts[1] && !parts[1].empty? && parts[1] != '*'
    # This array holds the regular expressions for matching attributes.
    # We use an array since we allow multiple expressions on the same attribute,
    # e.g. to find an element with both class 'foo' and class 'bar'.
    attrs = []
    # Match the ID attribute if specified. The captured part carries the
    # leading '#', which is not part of the attribute value.
    attrs << ['id', Regexp.new('^' + parts[2][1..-1] + '$')] if parts[2]
    # The third part is a collection of class names, prefixed with dot.
    # The class attribute is a set of space-separated names, so match accordingly
    parts[3].split('.').each do |cls|
      attrs << ['class', Regexp.new('\b' + cls + '\b')] unless cls.empty?
    end
    # Each remaining attribute expression is enclosed within square brackets,
    # so split on the brackets. The result may include empty elements, skip those.
    parts[4].split(/\[|\]/).each do |expr|
      next if expr.empty?
      # Create a regular expression for matching the attribute value,
      # based on the operation.
      name, type, value = ATTR_REGEX.match(expr)[1..3]
      pattern = case type
        when '='  then Regexp.new('^' + value + '$')    # the attribute value in full
        when '~=' then Regexp.new('\b' + value + '\b')  # a word within the attribute value
        when '|=' then Regexp.new('^' + value)          # the beginning of the attribute value
        else Regexp.new('')                             # any value (existence check)
      end
      attrs << [name, pattern]
    end
    # If there's nothing else in the statement, return this selector.
    selector = Selector.new(tag_name, attrs, alt, &depends)
    rest = parts[5]
    return selector if rest.empty?

    # Create a compound selector based on the remainder of the statement.
    case rest[0, 1]
    when ','
      # Alternative selector: second statement is alternative to the first one
      Selector.create(rest[1..-1], selector)
    when '+'
      # Sibling selector: the second statement matches a node whose previous
      # sibling element matches the first statement
      Selector.create(rest[1..-1]) do |node|
        node.previous_element && selector.match(node.previous_element)
      end
    when '>'
      # Child selector: the second statement matches a node whose parent
      # matches the first statement. Test the parent itself; REXML's parent?
      # reports whether a node can hold children, not whether it has a parent.
      Selector.create(rest[1..-1]) do |node|
        node.parent && selector.match(node.parent)
      end
    else
      # Nothing was consumed from the statement, so recursing on the
      # remainder would never terminate: the expression is not parsable.
      raise InvalidSelectorException, "Invalid selector statement: #{statement}" if rest == statement
      # Descendant selector: the second statement matches a node that has an
      # ancestor matching the first statement
      Selector.create(rest) do |node|
        found = false
        ancestor = node.parent
        while ancestor
          break if (found = selector.match(ancestor))
          ancestor = ancestor.parent
        end
        found
      end
    end
  end

  # Creates a new selector for the given class name.
  def Selector.for_class(cls)
    Selector.new(nil, [["class", Regexp.new('\b' + cls + '\b')]])
  end

  # Tests whether the node matches this selector.
  def match(node)
    # Match if no element name or element name same as node name, and none
    # of the attribute matches failed (an absent attribute never matches)
    matched = (!@tag_name || @tag_name == node.name) &&
      @attrs.all? { |name, pattern| pattern =~ node.attributes[name] }
    # If the node did not match, but we have an alternative match (x,y),
    # apply the alternative match instead
    return @alt.match(node) if !matched && @alt
    # If the node did match, but depends on another match (parent,
    # sibling, etc), apply the dependent match as well
    return @depends.call(node) if matched && @depends
    matched
  end

  # Returns a statement-like description of this selector, reconstructed
  # from the element name and the attribute expressions.
  def inspect
    stmt = (@tag_name || '').dup
    @attrs.each do |name, pattern|
      source = pattern.source
      stmt << "[#{name}"
      if source =~ /^\\b(.*)\\b$/
        stmt << "~=#{$1}"
      elsif source =~ /^\^(.*)\$$/
        stmt << "=#{$1}"
      elsif source =~ /^\^([^$]*)$/
        stmt << "|=#{$1}"
      end
      # Close each expression; an existence check prints as [name]
      stmt << ']'
    end
    stmt << ',' << @alt.inspect if @alt
    stmt
  end

end
|
594
|
+
|
595
|
+
|
596
|
+
# Implements an extractor using a simple expression format.
|
597
|
+
#
|
598
|
+
# For more information see MicroformatParser.extractor.
|
599
|
+
# Implements an extractor using a simple expression format: one or more
# extracts separated by |, each being an element name, an @attribute, an
# element@attribute pair, or a function call such as text().
class Extractor

  # :stopdoc:

  # Parse each extract into three parts:
  # $1 function name (excluding parentheses)
  # $2 element name
  # $3 attribute name (including leading @)
  # If a match is found the result is either $1, or $2 and/or $3.
  # Both alternatives are anchored to the whole extract.
  REGEX = /\A(?:(\w+)\(\)|([A-Za-z][A-Za-z0-9_\-:]*)?(@[A-Za-z][A-Za-z0-9_\-:]*)?)\z/

  # :startdoc:

  # context::   object whose methods are used to resolve function extracts.
  # statement:: the extractor expression.
  #
  # Raises InvalidExtractorException if the statement is empty, an extract
  # cannot be parsed, or a function is not a method of the context.
  def initialize(context, statement)
    # Keep a stripped copy: the caller's string may be frozen or reused.
    @statement = statement.strip
    # Break the statement into multiple extraction rules, separated by |.
    @extracts = @statement.split('|').map { |extract| compile(context, extract.strip) }
    raise InvalidExtractorException, "Invalid (empty) extraction statement" if @extracts.size == 0
  end

  # Extracts a value from the node: applies the extraction rules in order
  # and returns the first value that is neither nil nor false.
  def extract(node)
    value = nil
    @extracts.each do |extract|
      value = extract.call(node)
      break if value
    end
    value
  end

  # Returns the (stripped) expression this extractor was created from.
  def inspect
    @statement
  end

  private

  # Turns a single extract into a callable taking a node.
  def compile(context, extract)
    parts = REGEX.match(extract)
    raise InvalidExtractorException, "Invalid extraction statement" unless parts
    function, element, attribute = parts[1], parts[2], parts[3]
    # Drop the leading @ from the attribute name
    attribute = attribute[1..-1] if attribute
    if function
      # Function. Find a method in the context object (the rule class),
      # report an error if not found.
      begin
        context.method(function)
      rescue NameError => error
        raise InvalidExtractorException, error.message, error.backtrace
      end
    elsif element && attribute
      # Apply only if element of this type, and extract the named attribute.
      proc { |node| node.attributes[attribute] if node.name == element }
    elsif element
      # Apply only if element of this type, and extract the text value.
      # The text function belongs to the MicroformatParser module; this
      # class has no text method of its own.
      proc { |node| MicroformatParser.text(node) if node.name == element }
    elsif attribute
      # Extract the named attribute.
      proc { |node| node.attributes[attribute] }
    else
      raise InvalidExtractorException, "Invalid extraction statement"
    end
  end

end
|
661
|
+
|
662
|
+
# Extractor built from the expression "abbr@title|a@href|text()": it tries
# the title attribute of an abbr element, then the href attribute of an a
# element, then the text function looked up on self (the enclosing scope)
# when this constant is created.
# NOTE(review): presumably the fallback for rules that name no extractor --
# confirm against the rule-definition code, which is not shown here.
DEFAULT_EXTRACTOR = Extractor.new(self, "abbr@title|a@href|text()")
|
663
|
+
|
664
|
+
# Base class for InvalidSelectorException and InvalidExtractorException.
|
665
|
+
# Also raised when a rule is defined with invalid arguments.
|
666
|
+
class InvalidRuleException < StandardError
  # Derives from StandardError rather than Exception so that a plain
  # +rescue+ clause catches rule errors; rescuing this class or either
  # subclass by name works as before.
end

# Raised to indicate an invalid selector statement.
class InvalidSelectorException < InvalidRuleException
end

# Raised to indicate an invalid extractor statement.
class InvalidExtractorException < InvalidRuleException
end
|
676
|
+
|
677
|
+
|
678
|
+
end
|
679
|
+
|
680
|
+
|
681
|
+
# A parser for several microformats.
|
682
|
+
#
|
683
|
+
# Defines rules for the following attributes:
|
684
|
+
# tags:: A list of tags based on relTag (array of String)
|
685
|
+
# events:: A list of events based on hCalendar (array of HCalendar)
|
686
|
+
#
|
687
|
+
# For example:
|
688
|
+
# content = Microformats.parse(doc)
|
689
|
+
# puts "Tagged with " + content.tags.join(", ") if content.tags
|
690
|
+
class Microformats

  include MicroformatParser

  # Rule set for the fields of a single hCalendar event.
  #
  # Defines rules for the following attributes:
  # dtstart::     The event's start date/time (a Time object)
  # dtend::       The event's end date/time (a Time object)
  # summary::     The event's summary (text value)
  # description:: The event's description (XML node)
  # url::         The event's URL (string)
  # location::    The event's location (XML node)
  # contact::     The event's contact (XML node)
  #
  # For example:
  #   content = Microformats.parse(doc)
  #   content.events.each do |event|
  #     puts "Event on #{event.dtstart}"
  #   end
  class HCalendar

    include MicroformatParser

    # Date/time extractor: reads the title attribute when the node is an
    # abbr element, falls back to the node's text, and parses whatever was
    # found with Time.parse. Yields nil when no value was found.
    extractor :dt_extractor do |node|
      raw = node.name == 'abbr' ? node.attributes['title'] : nil
      raw ||= text(node)
      Time.parse(raw) if raw
    end

    # Date/time fields share the extractor defined above.
    rule_1 :dtstart, nil, :dt_extractor
    rule_1 :dtend, nil, :dt_extractor

    # Remaining fields use the text, xml and attribute extractors.
    rule_1 :summary, nil, :text
    rule_1 :description, nil, :xml
    rule_1 :url, nil, "a@href"
    rule_1 :location, nil, :xml
    rule_1 :contact, nil, :xml

  end

  # tags: the text of every link whose rel attribute contains "tag".
  rule :tags, "a[rel~=tag]", "text()"
  # events: every element with class vevent, parsed with HCalendar.
  rule :events, ".vevent", HCalendar

end
|
metadata
ADDED
@@ -0,0 +1,50 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
rubygems_version: 0.8.3
|
3
|
+
specification_version: 1
|
4
|
+
name: uformatparser
|
5
|
+
version: !ruby/object:Gem::Version
|
6
|
+
version: 1.0.0
|
7
|
+
date: 2005-11-20
|
8
|
+
summary: Microformat parser for extracting microcontent from (X)HTML
|
9
|
+
require_paths:
|
10
|
+
- lib
|
11
|
+
email: assaf@labnotes.org
|
12
|
+
homepage: http://trac.labnotes.org/cgi-bin/trac.cgi/wiki/Ruby/MicroformatParser
|
13
|
+
rubyforge_project: uformatparser
|
14
|
+
description: "Parser for extracting microcontent from (X)HTML documents, in any number of
|
15
|
+
microformats. Uses a DSL for specifying the parsing rules as a set of selectors
|
16
|
+
and extractors. Supports a CSS-like selector and XPath-like extractor syntaxes
|
17
|
+
for quick and easy rule writing. Also supports reusable and compound rules."
|
18
|
+
autorequire: uformatparser.rb
|
19
|
+
default_executable:
|
20
|
+
bindir: bin
|
21
|
+
has_rdoc: true
|
22
|
+
required_ruby_version: !ruby/object:Gem::Version::Requirement
|
23
|
+
requirements:
|
24
|
+
-
|
25
|
+
- ">"
|
26
|
+
- !ruby/object:Gem::Version
|
27
|
+
version: 0.0.0
|
28
|
+
version:
|
29
|
+
platform: ruby
|
30
|
+
authors:
|
31
|
+
- Assaf Arkin
|
32
|
+
files:
|
33
|
+
- lib/uformatparser.rb
|
34
|
+
- README
|
35
|
+
- Rakefile
|
36
|
+
- MIT-LICENSE
|
37
|
+
test_files: []
|
38
|
+
rdoc_options:
|
39
|
+
- "--main"
|
40
|
+
- README
|
41
|
+
- "--title"
|
42
|
+
- Microformat parser
|
43
|
+
- "--line-numbers"
|
44
|
+
extra_rdoc_files:
|
45
|
+
- README
|
46
|
+
executables: []
|
47
|
+
extensions: []
|
48
|
+
requirements:
|
49
|
+
- ReXML. HTML->ReXML parser.
|
50
|
+
dependencies: []
|