uformatparser 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/MIT-LICENSE +20 -0
- data/README +44 -0
- data/Rakefile +74 -0
- data/lib/uformatparser.rb +731 -0
- metadata +50 -0
data/MIT-LICENSE
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
Copyright (c) 2005 Assaf Arkin
|
|
2
|
+
|
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
|
4
|
+
a copy of this software and associated documentation files (the
|
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
|
9
|
+
the following conditions:
|
|
10
|
+
|
|
11
|
+
The above copyright notice and this permission notice shall be
|
|
12
|
+
included in all copies or substantial portions of the Software.
|
|
13
|
+
|
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
= Microformat Parser
|
|
2
|
+
|
|
3
|
+
MicroformatParser is a Ruby module for creating microformat parsers.
|
|
4
|
+
A microformat parser is a class with a set of rules for extracting
|
|
5
|
+
interesting content from (X)HTML documents. You create your own parser
|
|
6
|
+
by writing a class with a set of rules. The magic happens in the parse
|
|
7
|
+
method which takes an (X)HTML document or element, runs all the rules
|
|
8
|
+
on it, and returns a new object that holds the extracted values.
|
|
9
|
+
|
|
10
|
+
Here's a simple example to find all links and all tags in a document:
|
|
11
|
+
|
|
12
|
+
class MyParser
|
|
13
|
+
include MicroformatParser
|
|
14
|
+
|
|
15
|
+
rule :links, "a", "a@href"
|
|
16
|
+
rule :tags, "a[rel~=tag]", "text()"
|
|
17
|
+
end
|
|
18
|
+
|
|
19
|
+
content = MyParser.parse(doc)
|
|
20
|
+
puts "Found #{content.links.size} links" if content.links
|
|
21
|
+
puts "Tagged with " + content.tags.join(', ') if content.tags
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
== Documentation
|
|
25
|
+
|
|
26
|
+
You may want to read the documentation for a more detailed discussion of
|
|
27
|
+
selectors, extractors, compound rules, (X)HTML parsing and examples
|
|
28
|
+
|
|
29
|
+
http://trac.labnotes.org/cgi-bin/trac.cgi/wiki/Ruby/MicroformatParser
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
== Download
|
|
33
|
+
|
|
34
|
+
The latest version can be found at
|
|
35
|
+
|
|
36
|
+
http://rubyforge.org/projects/uformatparser/
|
|
37
|
+
|
|
38
|
+
== License
|
|
39
|
+
|
|
40
|
+
This package is licensed under the MIT license and/or the {Creative
|
|
41
|
+
Commons Attribution-ShareAlike}[http://creativecommons.org/licenses/by-sa/2.5/legalcode].
|
|
42
|
+
|
|
43
|
+
:include: MIT-LICENSE
|
|
44
|
+
|
data/Rakefile
ADDED
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
# Adapted from the rake Rakefile.
#
# Tasks defined here:
#   default  - run the tests and build the RDoc documentation
#   tests    - run every test file under test/
#   rdoc     - generate the API documentation
#   package  - build the gem, tar and zip packages
#   release  - test, clobber and package a new release

require 'rubygems'
# Gem.manage_gems is not defined by every RubyGems release; only call it where
# it exists so that loading the Rakefile does not fail with NoMethodError.
Gem.manage_gems if Gem.respond_to?(:manage_gems)
require 'rake/testtask'
require 'rake/rdoctask'
require 'rake/gempackagetask'


desc "Default Task"
task :default => [:tests, :rdoc]


Rake::TestTask.new :tests do |test|
  test.verbose = true
  test.test_files = ['test/*.rb']
end


# Create the documentation.
Rake::RDocTask.new do |rdoc|
  rdoc.main = "README"
  rdoc.rdoc_files.include("README", "lib/**/*.rb")
  rdoc.title = 'Microformat Parser'
end


# Create the GEM package.
gem_spec = Gem::Specification.new do |spec|
  spec.name = 'uformatparser'
  spec.version = "1.0.0"
  spec.summary = "Microformat parser for extracting microcontent from (X)HTML"
  spec.description = <<-EOF
    Parser for extracting microcontent from (X)HTML documents, in any number
    of microformats.

    Uses a DSL for specifying the parsing rules as a set of selectors and
    extractors. Supports a CSS-like selector and XPath-like extractor syntaxes
    for quick and easy rule writing. Also supports reusable and compound rules.
  EOF
  spec.author = "Assaf Arkin"
  spec.email = "assaf@labnotes.org"
  spec.homepage = "http://trac.labnotes.org/cgi-bin/trac.cgi/wiki/Ruby/MicroformatParser"

  spec.files = FileList["{test,lib}/**/*", "README", "Rakefile", "MIT-LICENSE"].to_a
  spec.require_path = "lib"
  spec.autorequire = 'uformatparser.rb'
  spec.requirements << "ReXML. HTML->ReXML parser."
  spec.has_rdoc = true
  spec.rdoc_options << '--main' << 'README' << '--title' << 'Microformat parser' << '--line-numbers'
  spec.extra_rdoc_files = ["README"]
  spec.rubyforge_project = "uformatparser"
end

gem = Rake::GemPackageTask.new(gem_spec) do |pkg|
  pkg.need_tar = true
  pkg.need_zip = true
end


# --------------------------------------------------------------------
# Creating a release

desc "Make a new release"
task :release => [:tests, :clobber, :package] do
  puts
  puts "**************************************************************"
  puts "* Release #{gem_spec.version} Complete."
  puts "* Packages ready to upload."
  puts "**************************************************************"
  puts
end
|
|
73
|
+
|
|
74
|
+
|
|
@@ -0,0 +1,731 @@
|
|
|
1
|
+
#
|
|
2
|
+
# = uformatparser.rb - Microformat parser
|
|
3
|
+
#
|
|
4
|
+
#--
|
|
5
|
+
# Author:: Assaf Arkin assaf@labnotes.org
|
|
6
|
+
# Documentation:: http://trac.labnotes.org/cgi-bin/trac.cgi/wiki/Ruby/MicroformatParser
|
|
7
|
+
# Copyright:: Copyright (c) 2005 Assaf Arkin
|
|
8
|
+
# License:: Creative Commons Attribution-ShareAlike
|
|
9
|
+
#
|
|
10
|
+
#++
|
|
11
|
+
|
|
12
|
+
require 'rexml/document'
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
# Implements a microformat parser by extending a class that includes this module.
|
|
16
|
+
#
|
|
17
|
+
# === The Basics
|
|
18
|
+
#
|
|
19
|
+
# To create a microformat parser, extend a class with this module and use the
|
|
20
|
+
# +rule+ method to define parsing rules for that class. Call +parse+ to parse the
|
|
21
|
+
# content, returning a new instance of the class holding all values extracted from
|
|
22
|
+
# parsing. You can parse a document or an element.
|
|
23
|
+
#
|
|
24
|
+
# For example:
|
|
25
|
+
# class Microformats
|
|
26
|
+
# include MicroformatParser
|
|
27
|
+
#
|
|
28
|
+
# class HCalendar
|
|
29
|
+
# include MicroformatParser
|
|
30
|
+
#
|
|
31
|
+
# # Extract ISO date/time
|
|
32
|
+
# extractor :dt_extractor do |node|
|
|
33
|
+
# value = node.attributes['title'] if node.name == 'abbr'
|
|
34
|
+
# value = text(node) unless value
|
|
35
|
+
# value ? Time.parse(value) : nil
|
|
36
|
+
# end
|
|
37
|
+
#
|
|
38
|
+
# rule_1 :dtstart, nil, :dt_extractor
|
|
39
|
+
# rule_1 :dtend, nil, :dt_extractor
|
|
40
|
+
# rule_1 :summary, nil, :text
|
|
41
|
+
# rule_1 :description, nil, :xml
|
|
42
|
+
# rule_1 :url, nil, "a@href"
|
|
43
|
+
# end
|
|
44
|
+
#
|
|
45
|
+
# rule :tags, "a[rel~=tag]", "text()"
|
|
46
|
+
# rule :events, ".vevent", HCalendar
|
|
47
|
+
# end
|
|
48
|
+
#
|
|
49
|
+
# content = Microformats.parse(doc)
|
|
50
|
+
# puts content.tags
|
|
51
|
+
# puts content.events
|
|
52
|
+
#
|
|
53
|
+
module MicroformatParser
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
# Create a new rule.
|
|
57
|
+
#
|
|
58
|
+
# There are two ways to define a rule:
|
|
59
|
+
# * rule name, selector?, extractor?, limit?
|
|
60
|
+
# * rule name, limit? { block }
|
|
61
|
+
#
|
|
62
|
+
# The +name+ argument specifies an instance variable that holds the value
|
|
63
|
+
# (or values) extracted from processing this rule. It can be a string or
|
|
64
|
+
# a symbol. An attribute accessor is created with that name.
|
|
65
|
+
#
|
|
66
|
+
# The +selector+ argument identifies all nodes that match the rule. It can
|
|
67
|
+
# be a CSS-style selector (string) or a method/proc. A symbol specifies
|
|
68
|
+
# a method to use from this class. The method/proc receives a single argument
|
|
69
|
+
# with the node and must return true/false.
|
|
70
|
+
#
|
|
71
|
+
# If selector is absent, the default selector will match any element with
|
|
72
|
+
# a class of the same name as the name argument. For example:
|
|
73
|
+
# rule :dtstart
|
|
74
|
+
# Matches all elements with the class _dtstart_.
|
|
75
|
+
#
|
|
76
|
+
# The +extractor+ argument specifies how to extract a value from a selected
|
|
77
|
+
# node. It can be a list of extract rules (string), a method/proc, or a class.
|
|
78
|
+
# A symbol specifies a method to use from this class. The method/proc receives
|
|
79
|
+
# a single argument with the node and returns the extracted value, or nil.
|
|
80
|
+
#
|
|
81
|
+
# If the extractor is a class, it references a microformat parser which is
|
|
82
|
+
# then called to parse the content of a matching element.
|
|
83
|
+
#
|
|
84
|
+
# If extractor is absent, the default extractor is used:
|
|
85
|
+
# abbr@title|a@href|text()
|
|
86
|
+
#
|
|
87
|
+
# The +limit+ argument specifies the cardinality of the rule's value:
|
|
88
|
+
# 0 The rule is never applied
|
|
89
|
+
# 1 The rule is applied once, the first extracted value is set
|
|
90
|
+
# -1 The rule is applied multiple times, extracted values are set in an array
|
|
91
|
+
# n The rule is applied up to _n_ times, extracted values are set in an array
|
|
92
|
+
#
|
|
93
|
+
# In the second form, a block is specified instead of the selector/extractor.
|
|
94
|
+
# The block is called with a node and returns the extracted value, or nil.
|
|
95
|
+
# Defines a parsing rule on this class, plus a read/write accessor named after it.
#
# name::      Rule name (string or symbol); required.
# selector::  nil (match elements whose class attribute contains +name+),
#             a CSS-style string, a Proc/Method, or a symbol naming a method
#             of this class.
# extractor:: nil (use +default_extractor+), an expression string, a
#             Proc/Method, a symbol naming a method of this class, or a class
#             that responds to +parse+.
# limit::     Cardinality passed to the Rule (0, 1, n, or -1 for unbounded).
#
# When a block is given it is used as the extractor and no selector or
# extractor argument may be supplied.
#
# Raises InvalidRuleException, InvalidSelectorException or
# InvalidExtractorException when an argument is missing or of the wrong kind.
# Returns the ruleset with the new rule appended.
def rule(name, selector = nil, extractor = nil, limit = -1, &proc)
  raise InvalidRuleException, "First argument (rule name) is required" unless name
  if proc
    # The rule processing is taken from the block, everything else must be nil
    raise InvalidRuleException, "Can't specify selector/extractor in combination with proc" if selector or extractor
    rule = Rule.new(name, nil, proc, limit)
  else
    # Determine the selector.
    selector = case selector
    when NilClass
      # Absent selector: match elements whose class attribute contains the rule name
      class_match = Regexp.new("\\b#{name}\\b")
      lambda { |node| node.attributes['class'] =~ class_match }
    when String
      # CSS-style selector
      Selector.create(selector)
    when Proc, Method
      # Use as is
      selector
    when Symbol
      # Look up the named method on this class. Object#method raises NameError
      # for an unknown name, so translate that into the documented exception
      # and report the method that is actually missing.
      begin
        method(selector)
      rescue NameError => error
        raise InvalidSelectorException, "Method #{selector} is not a valid selector", error.backtrace
      end
    else
      raise InvalidSelectorException, "Invalid selector type: must be a string, symbol, proc/method or nil"
    end

    # Determine the extractor.
    extractor = case extractor
    when NilClass
      default_extractor
    when String
      # Extractor expression, resolved against this class
      Extractor.new(self, extractor)
    when Proc, Method
      # Use as is
      extractor
    when Symbol
      # Same lookup and error translation as for symbol selectors
      begin
        method(extractor)
      rescue NameError => error
        raise InvalidExtractorException, "Method #{extractor} is not a valid extractor", error.backtrace
      end
    when Class
      # Extractor is a class, generally another ruleset: it must offer parse
      begin
        extractor.method(:parse)
      rescue NameError => error
        raise InvalidExtractorException, "Extractor class must implement the method parse", error.backtrace
      end
      extractor
    else
      raise InvalidExtractorException, "Invalid extractor type: must be a string, parser class, block or nil"
    end

    rule = Rule.new(name, selector, extractor, limit)
  end

  # The accessor holds the value(s) extracted by the rule
  attr_accessor name
  # Add this rule to the class's ruleset
  self.rules << rule
end
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
# Create a new rule that extracts at most one value.
|
|
168
|
+
#
|
|
169
|
+
# Same as calling +rule+ with +limit+=1
|
|
170
|
+
# Shorthand for +rule+ with a limit of 1, i.e. a rule that keeps only the
# first extracted value. Takes the same name, selector, extractor and block
# arguments as +rule+.
def rule_1(name, selector = nil, extractor = nil, &proc)
  single_value = 1
  rule(name, selector, extractor, single_value, &proc)
end
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
# Creates a new selector.
|
|
177
|
+
#
|
|
178
|
+
# There are two ways to create a selector:
|
|
179
|
+
# * selector name, statement
|
|
180
|
+
# * selector name { block }
|
|
181
|
+
#
|
|
182
|
+
# The +name+ argument (a string or symbol) specifies the selector name,
|
|
183
|
+
# defining a class method with that name that can be used to identify matching
|
|
184
|
+
# element.
|
|
185
|
+
#
|
|
186
|
+
# The selector can be a CSS-style selector (string) or a block that accepts a
|
|
187
|
+
# single argument (element) and returns true or false.
|
|
188
|
+
#
|
|
189
|
+
# For example:
|
|
190
|
+
# selector :select_link { |node| node.name == 'a' }
|
|
191
|
+
# extractor :extract_link { |node| node.attributes['href'] }
|
|
192
|
+
# rule :links, :select_link, :extract_link
|
|
193
|
+
# Defines a named selector as a method on this class.
#
# name::     Selector name (string or symbol); required.
# selector:: nil or a CSS-style selector string.
#
# With a block and no selector argument, the block is the selector. With
# neither, the selector matches elements whose class attribute contains
# +name+. The generated method takes a node and returns the match result.
#
# Raises InvalidSelectorException if the name is missing or the selector
# argument is neither a string nor nil.
def selector(name, selector = nil, &proc)
  raise InvalidSelectorException, "First argument (rule name) is required" unless name
  matcher = case selector
  when NilClass
    if proc
      proc
    else
      class_match = Regexp.new("\\b#{name}\\b")
      lambda { |node| node.attributes['class'] =~ class_match }
    end
  when String
    # Selector objects answer +match+, not +call+; hand back a callable so
    # the generated method works for CSS-style selectors too.
    Selector.create(selector).method(:match)
  else
    raise InvalidSelectorException, "Invalid selector type: must be a string, block or nil"
  end
  # Create a method on this class (not its instances) named after the selector
  define_singleton_method(name) { |node| matcher.call(node) }
end
|
|
217
|
+
|
|
218
|
+
|
|
219
|
+
# Creates a new extractor.
|
|
220
|
+
#
|
|
221
|
+
# There are two ways to create an extractor:
|
|
222
|
+
# * extractor name, statement
|
|
223
|
+
# * extractor name { block }
|
|
224
|
+
#
|
|
225
|
+
# The +name+ argument (string or symbol) specifies the extractor name,
|
|
226
|
+
# defining a class method with that name that can be used to extract the
|
|
227
|
+
# value of a node.
|
|
228
|
+
#
|
|
229
|
+
# The extractor can be an expression (string) or a block that accepts a
|
|
230
|
+
# single argument (element) and returns the extracted value, or nil.
|
|
231
|
+
#
|
|
232
|
+
# For example:
|
|
233
|
+
# selector :select_link { |node| node.name == 'a' }
|
|
234
|
+
# extractor :extract_link { |node| node.attributes['href'] }
|
|
235
|
+
# rule :links, :select_link, :extract_link
|
|
236
|
+
#
|
|
237
|
+
# The expression takes the form of:
|
|
238
|
+
# extractor := extract (|extract)*
|
|
239
|
+
# extract := element | @attribute | element@attribute | method()
|
|
240
|
+
#
|
|
241
|
+
# If multiple extracts are specified, the first extracted value is used.
|
|
242
|
+
#
|
|
243
|
+
# If an element is specified, the text value is extracted only if the selected
|
|
244
|
+
# node is an element of that type. If an attribute is specified, the extracted
|
|
245
|
+
# value is the attribute's value. If both element and attribute are used, the
|
|
246
|
+
# attribute value is extracted only if the selected node is an element of that
|
|
247
|
+
# type.
|
|
248
|
+
#
|
|
249
|
+
# If a method is specified, that method is called for the node. There are two
|
|
250
|
+
# methods available in any class: +text+ and +xml+.
|
|
251
|
+
# Defines a named extractor as a method on this class.
#
# name::      Extractor name (string or symbol); required.
# extractor:: nil or an extractor expression string.
#
# With a block and no extractor argument, the block is the extractor. With
# neither, +default_extractor+ is used. The generated method takes a node and
# returns the extracted value.
#
# Raises InvalidExtractorException if the name is missing or the extractor
# argument is neither a string nor nil.
def extractor(name, extractor = nil, &proc)
  raise InvalidExtractorException, "First argument (rule name) is required" unless name
  source = case extractor
  when NilClass
    # Absent extractor: either block if provided, otherwise default extractor
    proc || default_extractor
  when String
    # Extractor expression; the Extractor resolves method() extracts against
    # this class, so it must be given as the context.
    Extractor.new(self, extractor)
  else
    raise InvalidExtractorException, "Invalid extractor type: must be a string, parser class, block or nil"
  end
  # Blocks and methods answer +call+; Extractor objects are invoked through
  # +extract+ (as Rule#process does), so dispatch on what the object offers.
  define_singleton_method(name) do |node|
    source.respond_to?(:call) ? source.call(node) : source.extract(node)
  end
end
|
|
269
|
+
|
|
270
|
+
# Returns the default extractor.
|
|
271
|
+
# Returns the extractor used when a rule or extractor definition supplies
# none: the DEFAULT_EXTRACTOR constant.
def default_extractor
  DEFAULT_EXTRACTOR
end
|
|
274
|
+
|
|
275
|
+
# Called to parse a node.
|
|
276
|
+
#
|
|
277
|
+
# The node may be an element (REXML::Element) or a document (REXML::Document).
|
|
278
|
+
#
|
|
279
|
+
# For example:
|
|
280
|
+
# class ParseLinks
|
|
281
|
+
# include MicroformatParser
|
|
282
|
+
#
|
|
283
|
+
# rule :links, "a", "@href"
|
|
284
|
+
# rule :ids, "a[id]", "a@id"
|
|
285
|
+
# end
|
|
286
|
+
#
|
|
287
|
+
# parsed = ParseLinks.parse(doc)
|
|
288
|
+
# puts parsed.links
|
|
289
|
+
# puts parsed.ids
|
|
290
|
+
# Applies the rules to +node+ and, recursively, to its child elements.
#
# node::    The node to parse; rules are applied only if it is a
#           REXML::Element (REXML::Document is a subclass of it).
# context:: Object receiving the extracted values; a new instance of this
#           class is created when absent.
# rules::   Ruleset to apply; defaults to the rules of this class.
#
# A rule whose +process+ returns true for a node is not applied to that
# node's descendants; all other rules still are.
#
# Returns the context object.
def parse(node, context = nil, rules = nil)
  context ||= self.new
  rules ||= self.rules
  if rules and node.kind_of?(REXML::Element)
    # Copy the ruleset lazily, only once a rule has to be removed from it
    remaining = nil
    rules.each_with_index do |rule, index|
      next unless rule and rule.process(node, context)
      remaining ||= rules.dup
      remaining[index] = nil
    end
    rules = remaining if remaining
    # Always hand the current (possibly reduced) ruleset to the children.
    # Passing nil here would make the recursive call fall back to the full
    # ruleset and re-apply rules already consumed by an ancestor.
    node.elements.each { |child| parse(child, context, rules) }
  end
  context
end
|
|
316
|
+
|
|
317
|
+
# Returns all the rules for this class.
|
|
318
|
+
#
|
|
319
|
+
# Returns an array of rules defined with +rule+.
|
|
320
|
+
#
|
|
321
|
+
# You can use this method to inspect rules, add/remove rules, etc. Rules are
|
|
322
|
+
# processed in the order in which they are added.
|
|
323
|
+
# Returns the array of rules defined on this class with +rule+, creating an
# empty array on first use. The same array is returned on every call, so
# callers may inspect, add or remove rules through it.
def rules
  @microparser_rules ||= []
end
|
|
328
|
+
|
|
329
|
+
# Returns the text value of a node.
|
|
330
|
+
# Returns the text value of a node: the values of its REXML::Text children
# concatenated, in document order, with the text of its REXML::Element
# children (recursively). Other child node types are ignored. Returns an
# empty string when there is no text.
def text(node)
  value = ''.dup
  node.children.each do |child|
    # instance_of? is deliberate: only plain text and element nodes contribute
    if child.instance_of?(REXML::Text)
      value << child.value
    elsif child.instance_of?(REXML::Element)
      value << text(child)
    end
  end
  value
end
|
|
341
|
+
|
|
342
|
+
# Returns the XML value of a node (the node itself).
|
|
343
|
+
# Returns the XML value of a node, which is the node itself, unchanged.
def xml(node)
  node
end
|
|
346
|
+
|
|
347
|
+
module_function :text, :xml
|
|
348
|
+
|
|
349
|
+
|
|
350
|
+
|
|
351
|
+
private
|
|
352
|
+
|
|
353
|
+
# Hook invoked when this module is included: extends the including
# class/module with this module as well, so the methods defined here are
# callable on the class itself.
def self.included(mod)
  mod.extend(self)
end
|
|
356
|
+
|
|
357
|
+
|
|
358
|
+
# Implements a rule.
|
|
359
|
+
#
|
|
360
|
+
# A rule identifies matching nodes using a selector, and a means to extract their value
|
|
361
|
+
# using an extractor. The rule also identifies an instance variable and attribute accessor
|
|
362
|
+
# to retrieve the extracted value, and the cardinality of that value.
|
|
363
|
+
#
|
|
364
|
+
# For more information see MicroformatParser.rule.
|
|
365
|
+
class Rule

  # The name of the instance variable (with leading @) that holds the value.
  attr_reader :name
  # The rule cardinality (or value limit)
  # 0::  No value (disabled)
  # 1::  First value extracted
  # n::  Up to n values (array)
  # -1:: Unbound (array)
  attr_accessor :limit
  # The rule selector
  attr_reader :selector
  # The rule extractor
  attr_reader :extractor

  # name::      Rule name; stored as the symbol of the instance variable
  #             that receives the result.
  # selector::  nil, or an object used to decide whether a node matches.
  # extractor:: Object used to extract the value from a matching node.
  # limit::     Cardinality, see +limit+.
  def initialize(name, selector, extractor, limit)
    @name = "@#{name}".to_sym
    @selector = selector
    @extractor = extractor
    @limit = limit
  end

  # Processes this rule on a node, storing any extracted value in +context+.
  #
  # Returns true if the rule was processed and should be reduced (not applied
  # to any child nodes). Returns false if the node was not selected or no
  # value was extracted.
  def process(node, context)
    # A disabled rule (limit = 0) does nothing and is reduced
    return true if @limit == 0
    # A singular rule (limit = 1) that already has a value does nothing
    current = context.instance_variable_get(@name)
    return true if @limit == 1 and current
    # The selector may be nil when the rule wraps a block
    return false if @selector and !selected?(node, context)
    value = extract(node, context)
    return false unless value

    if @limit == 1
      # Singular: store the value itself
      context.instance_variable_set(@name, value)
    elsif !current
      # First value: start a new array
      context.instance_variable_set(@name, [value])
    elsif current.instance_of?(Array) and (@limit < 0 or current.size < @limit)
      # No limit, or limit not reached yet: append
      current << value
    end
    # Always reduce once a value was extracted: there is no point in applying
    # the rule to any child nodes.
    true
  end

  def inspect
    if @selector
      "[to #{@name} from #{@selector.inspect}, #{@extractor.inspect}, limit #{@limit}]"
    else
      "[to #{@name} from #{@extractor.inspect}, limit #{@limit}]"
    end
  end

  private

  # Runs the selector against the node: unbound methods are bound to the
  # context first, Selector objects use +match+, anything else uses +call+.
  def selected?(node, context)
    if @selector.instance_of?(UnboundMethod)
      @selector.bind(context).call(node)
    elsif @selector.instance_of?(Selector)
      @selector.match(node)
    else
      @selector.call(node)
    end
  end

  # Runs the extractor against the node and returns its value; returns nil
  # for an extractor of an unsupported kind.
  def extract(node, context)
    case @extractor
    when UnboundMethod then @extractor.bind(context).call(node)
    when Extractor then @extractor.extract(node)
    when Proc, Method then @extractor.call(node)
    when Class then @extractor.parse(node)
    end
  end

end
|
|
437
|
+
|
|
438
|
+
|
|
439
|
+
# Implements a selector using a CSS-style expression.
|
|
440
|
+
#
|
|
441
|
+
# For more information see MicroformatParser.selector.
|
|
442
|
+
class Selector
|
|
443
|
+
|
|
444
|
+
# :stopdoc:
|
|
445
|
+
|
|
446
|
+
# Parse each selector into five parts:
|
|
447
|
+
# $1 element name or * (optional)
|
|
448
|
+
# $2 ID name (including leading #, optional)
|
|
449
|
+
# $3 class names (including leading ., zero or more)
|
|
450
|
+
# $4 attribute expressions (zero or more)
|
|
451
|
+
# $5 anything else (no leading spaces)
|
|
452
|
+
REGEX = /^(\*|[A-Za-z][A-Za-z0-9_\-:]*)?(#[A-Za-z][A-Za-z0-9_\-:]*)?((?:\.[A-Za-z][A-Za-z0-9_\-:]*){0,})((?:\[[A-Za-z][A-Za-z0-9_\-:]*(?:(?:~|\|)?=.*)?\]){0,})\s*(.*)$/
|
|
453
|
+
|
|
454
|
+
# Parse each attribute expression into three parts:
|
|
455
|
+
# $1 attribute name
|
|
456
|
+
# $2 matching operation
|
|
457
|
+
# $3 matched value
|
|
458
|
+
# Matching operation may be =, ~= or |=. Value may be empty.
|
|
459
|
+
ATTR_REGEX = /^([A-Za-z][A-Za-z0-9_\-:]*)((?:~|\|)?=)?(.*)$/
|
|
460
|
+
|
|
461
|
+
# :startdoc:
|
|
462
|
+
|
|
463
|
+
def initialize(tag_name, attrs, alt = nil, &depends)
|
|
464
|
+
@tag_name = tag_name
|
|
465
|
+
@attrs = attrs
|
|
466
|
+
@alt = alt
|
|
467
|
+
@depends = depends
|
|
468
|
+
end
|
|
469
|
+
|
|
470
|
+
|
|
471
|
+
public
|
|
472
|
+
# Creates a new selector.
|
|
473
|
+
def Selector.create(statement, alt = nil, &depends)
|
|
474
|
+
statement.strip!
|
|
475
|
+
# Parse the first selector expression into $1-$4, anything else goes in $5
|
|
476
|
+
parts = REGEX.match(statement)
|
|
477
|
+
raise InvalidSelectorException, "Invalid (empty) selector statement" if parts[0].length == 0
|
|
478
|
+
|
|
479
|
+
# Set tag_name to the element name if specified and not *
|
|
480
|
+
tag_name = parts[1] if parts[1] and !parts[1].empty? and parts[1] != '*'
|
|
481
|
+
# This array holds the regular expressions for matching attributes.
|
|
482
|
+
# We use an array since we allow multiple expressions on the same attribute,
|
|
483
|
+
# e.g. to find an element with both class 'foo' and class 'bar'.
|
|
484
|
+
attrs = []
|
|
485
|
+
# Match the ID attribute if specified
|
|
486
|
+
attrs << ['id', Regexp.new('^' + parts[2] + '$')] if parts[2]
|
|
487
|
+
# The third part is a collection of class names, prefixed with dot
|
|
488
|
+
# Create an attribute matching regular expression for each class
|
|
489
|
+
# The class attribute is a set of space-separated names, so match accordingly
|
|
490
|
+
if !parts[3].empty?
|
|
491
|
+
parts[3].split('.').each { |cls| attrs << ['class', Regexp.new('\b' + cls + '\b')] if !cls.empty? }
|
|
492
|
+
end
|
|
493
|
+
# Process the remaining attribute expressions. Each expression is enclosed
|
|
494
|
+
# within square brackets, so split the expressions into anything between the
|
|
495
|
+
# square brackets. The result may include empty elements, skip those.
|
|
496
|
+
parts[4].split(/\[|\]/).each do |expr|
|
|
497
|
+
if not expr.empty?
|
|
498
|
+
# Parse the attribute expression and created a regular expression
|
|
499
|
+
# for matching the attribute value, based on the operation.
|
|
500
|
+
name, type, value = ATTR_REGEX.match(expr)[1..3]
|
|
501
|
+
case type
|
|
502
|
+
when '=' then
|
|
503
|
+
# Match the attribute value in full
|
|
504
|
+
match = Regexp.new('^' + value + '$')
|
|
505
|
+
when '~=' then
|
|
506
|
+
# Match a space-separated word within the attribute value
|
|
507
|
+
match = Regexp.new('\b' + value + '\b')
|
|
508
|
+
when '|=' then
|
|
509
|
+
# Match the beginning of the attribute value
|
|
510
|
+
match = Regexp.new('^' + value)
|
|
511
|
+
else
|
|
512
|
+
# Match all attributes values (existence check)
|
|
513
|
+
match = Regexp.new('')
|
|
514
|
+
end
|
|
515
|
+
attrs << [name, match]
|
|
516
|
+
end
|
|
517
|
+
end
|
|
518
|
+
# If there's nothing else in the statement, return this selector.
|
|
519
|
+
selector = Selector.new(tag_name, attrs, alt, &depends)
|
|
520
|
+
return selector if parts[5].empty?
|
|
521
|
+
|
|
522
|
+
# Create a compound selector based on the remainder of the statement.
|
|
523
|
+
# This is also why we need the factory and can't call new directly.
|
|
524
|
+
return case parts[5][0]
|
|
525
|
+
when ?,
|
|
526
|
+
# Alternative selector: second statement is alternative to the first one
|
|
527
|
+
Selector.create(parts[5][1..-1], selector)
|
|
528
|
+
when ?+
|
|
529
|
+
# Sibling selector: second statement is returned that will match node
|
|
530
|
+
# followed by previous sibling node based on first statement
|
|
531
|
+
Selector.create(parts[5][1..-1]) do |node|
|
|
532
|
+
node.previous_element and selector.match(node.previous_element)
|
|
533
|
+
end
|
|
534
|
+
when ?>
|
|
535
|
+
# Child selector: second statement is returned that will match node
|
|
536
|
+
# followed by parent node based on the first statement
|
|
537
|
+
Selector.create(parts[5][1..-1]) do |node|
|
|
538
|
+
node.parent? and selector.match(node.parent)
|
|
539
|
+
end
|
|
540
|
+
else
|
|
541
|
+
# Descendant selector: second statement is returned that will match node
|
|
542
|
+
# followed by ascendant node based on the first statement
|
|
543
|
+
Selector.create(parts[5]) do |node|
|
|
544
|
+
parent = node.parent
|
|
545
|
+
match = false
|
|
546
|
+
while parent
|
|
547
|
+
break if match = selector.match(parent)
|
|
548
|
+
parent = parent.parent
|
|
549
|
+
end
|
|
550
|
+
match
|
|
551
|
+
end
|
|
552
|
+
end
|
|
553
|
+
end
|
|
554
|
+
|
|
555
|
+
# Creates a new selector for the given class name.
|
|
556
|
+
def Selector.for_class(cls)
|
|
557
|
+
Selector.new(nil, [["class", Regexp.new('\b' + cls + '\b')]])
|
|
558
|
+
end
|
|
559
|
+
|
|
560
|
+
# Identifies all matching nodes.
|
|
561
|
+
def match(node)
|
|
562
|
+
# Match node if no element name or element name same as node name
|
|
563
|
+
match = (!@tag_name or @tag_name == node.name)
|
|
564
|
+
# No match if one of the attribute matches failed
|
|
565
|
+
for attr in @attrs
|
|
566
|
+
if attr[1] !~ node.attributes[attr[0]]
|
|
567
|
+
match = false
|
|
568
|
+
break
|
|
569
|
+
end
|
|
570
|
+
end
|
|
571
|
+
# If the node did not match, but we have an alternative match
|
|
572
|
+
# (x+y), apply the alternative match instead
|
|
573
|
+
return @alt.match(node) if not match and @alt
|
|
574
|
+
# If the node did match, but depends on another match (parent,
|
|
575
|
+
# sibling, etc), apply the dependent match as well
|
|
576
|
+
return @depends.call(node) if match and @depends
|
|
577
|
+
match
|
|
578
|
+
end
|
|
579
|
+
|
|
580
|
+
# Returns a selector-like string describing this selector: the tag name
# (if any), one bracketed condition per attribute, and the alternative
# selector (if any) after a comma.
#
# The operator shown for an attribute is recovered from the shape of its
# regular expression source:
# <tt>\bvalue\b</tt>:: shown as <tt>[name~=value]</tt>
# <tt>^value$</tt>::   shown as <tt>[name=value]</tt>
# <tt>^value</tt>::    shown as <tt>[name|=value]</tt>
# Any other pattern is shown as <tt>[name]</tt>.
def inspect
  stmt = @tag_name.to_s.dup
  @attrs.each do |name, pattern|
    condition =
      case pattern.source
      when /\A\\b(.*)\\b\z/ then "~=#{Regexp.last_match(1)}"
      when /\A\^(.*)\$\z/   then "=#{Regexp.last_match(1)}"
      when /\A\^([^$]*)\z/  then "|=#{Regexp.last_match(1)}"
      end
    # Every attribute condition gets its own closing bracket.
    stmt << "[#{name}#{condition}]"
  end
  stmt << ',' << @alt.inspect if @alt
  stmt
end
|
|
592
|
+
|
|
593
|
+
end
|
|
594
|
+
|
|
595
|
+
|
|
596
|
+
# Implements an extractor using a simple expression format.
|
|
597
|
+
#
|
|
598
|
+
# For more information see MicroformatParser.extractor.
|
|
599
|
+
class Extractor

  # :stopdoc:

  # Parse each extraction rule into three parts:
  # $1 function name (excluding parentheses)
  # $2 element name
  # $3 attribute name (including leading @)
  # If a match is found the result is either $1, or $2 and/or $3.
  #
  # The expression is anchored at both ends so that a rule is accepted
  # only when all of it is understood; stray text before or after a
  # valid rule is rejected rather than silently ignored.
  REGEX = /\A(?:(\w+)\(\)|([A-Za-z][A-Za-z0-9_\-:]*)?(@[A-Za-z][A-Za-z0-9_\-:]*)?)\z/

  # :startdoc:

  # Creates an extractor from an extraction statement.
  #
  # context::   object on which function rules (such as <tt>text()</tt>)
  #             are looked up
  # statement:: one or more extraction rules separated by <tt>|</tt>;
  #             whitespace around the statement and around each rule is
  #             ignored, and the string itself is not modified
  #
  # Raises InvalidExtractorException if the statement is empty, a rule
  # cannot be parsed, or a function rule names an unknown method.
  def initialize(context, statement)
    # Break the statement into multiple extraction rules, separated by |.
    @extracts = statement.strip.split('|').map { |rule| compile(context, rule.strip) }
    raise InvalidExtractorException, "Invalid (empty) extraction statement" if @extracts.empty?
  end

  # Extracts a value from the node based on the extractor expression.
  #
  # The rules are applied in order and the first value that is neither
  # +nil+ nor +false+ is returned; if there is none, the result of the
  # last rule is returned.
  def extract(node)
    value = nil
    @extracts.find { |rule| value = rule.call(node) }
    value
  end

  def inspect
    @extracts.join('|')
  end

  private

  # Turns one extraction rule into a callable taking a node.
  def compile(context, rule)
    parts = REGEX.match(rule)
    raise InvalidExtractorException, "Invalid extraction statement" unless parts
    function, element, attribute = parts.captures
    attr_name = attribute[1..-1] if attribute
    if function
      # Function. Find a method in the context object (the rule class),
      # report an error if not found.
      begin
        context.method(function)
      rescue NameError => error
        raise InvalidExtractorException, error.message, error.backtrace
      end
    elsif element && attr_name
      # Apply only if element of this type, and extract the named attribute.
      proc { |node| node.attributes[attr_name] if node.name == element }
    elsif element
      # Apply only if element of this type, and extract the text value.
      proc { |node| node_text(context, node) if node.name == element }
    elsif attr_name
      # Extract the named attribute.
      proc { |node| node.attributes[attr_name] }
    else
      raise InvalidExtractorException, "Invalid extraction statement"
    end
  end

  # Returns the text value of a node for element-only rules.
  #
  # NOTE(review): the rule used to call a bare text(node), which resolves
  # against the extractor itself, and this class defines no such method.
  # A +text+ method on the extractor is still preferred if one exists;
  # otherwise the call goes to the context object, where function rules
  # such as text() are looked up. Confirm this is the intended target.
  def node_text(context, node)
    respond_to?(:text, true) ? text(node) : context.send(:text, node)
  end

end
|
|
661
|
+
|
|
662
|
+
DEFAULT_EXTRACTOR = Extractor.new(self, "abbr@title|a@href|text()")
|
|
663
|
+
|
|
664
|
+
# Base class for InvalidSelectorException and InvalidExtractorException.
# Also raised when a rule is defined with invalid arguments.
#
# Derives from StandardError so that a plain +rescue+ clause catches it;
# deriving directly from Exception would let it bypass ordinary error
# handling.
class InvalidRuleException < StandardError
end

# Raised to indicate an invalid selector statement.
class InvalidSelectorException < InvalidRuleException
end

# Raised to indicate an invalid extractor statement.
class InvalidExtractorException < InvalidRuleException
end
|
|
676
|
+
|
|
677
|
+
|
|
678
|
+
end
|
|
679
|
+
|
|
680
|
+
|
|
681
|
+
# A parser for several microformats.
|
|
682
|
+
#
|
|
683
|
+
# Defines rules for the following attributes:
|
|
684
|
+
# tags:: A list of tags based on relTag (array of String)
|
|
685
|
+
# events:: A list of events based on hCalendar (array of HCalendar)
|
|
686
|
+
#
|
|
687
|
+
# For example:
|
|
688
|
+
# content = Microformats.parse(doc)
|
|
689
|
+
# puts "Tagged with " + content.tags.join(", ") if content.tags
|
|
690
|
+
class Microformats

  include MicroformatParser

  # Parses the fields of an hCalendar element.
  #
  # Defines rules for the following attributes:
  # dtstart::     The event's start date/time (a Time object)
  # dtend::       The event's end date/time (a Time object)
  # summary::     The event's summary (text value)
  # description:: The event's description (XML node)
  # url::         The event's URL (string)
  # location::    The event's location (uses the same :xml extractor as description)
  # contact::     The event's contact (uses the same :xml extractor as description)
  #
  # For example:
  #   content = Microformats.parse(doc)
  #   content.events.each do |event|
  #     puts "Event on " + event.dtstart.to_s
  #   end
  class HCalendar

    include MicroformatParser

    # Date/time extractor: takes the title attribute of an abbr element,
    # or the node's text otherwise, and parses it into a Time.
    # Yields nil when there is nothing to parse.
    extractor :dt_extractor do |node|
      raw = node.attributes['title'] if node.name == 'abbr'
      raw ||= text(node)
      Time.parse(raw) if raw
    end

    rule_1 :dtstart, nil, :dt_extractor
    rule_1 :dtend, nil, :dt_extractor
    rule_1 :summary, nil, :text
    rule_1 :description, nil, :xml
    rule_1 :url, nil, "a@href"
    rule_1 :location, nil, :xml
    rule_1 :contact, nil, :xml

  end

  # tags are the text of links marked rel="tag"; events are built by
  # applying the HCalendar rules to every element selected by .vevent.
  rule :tags, "a[rel~=tag]", "text()"
  rule :events, ".vevent", HCalendar

end
|
metadata
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
|
2
|
+
rubygems_version: 0.8.3
|
|
3
|
+
specification_version: 1
|
|
4
|
+
name: uformatparser
|
|
5
|
+
version: !ruby/object:Gem::Version
|
|
6
|
+
version: 1.0.0
|
|
7
|
+
date: 2005-11-20
|
|
8
|
+
summary: Microformat parser for extracting microcontent from (X)HTML
|
|
9
|
+
require_paths:
|
|
10
|
+
- lib
|
|
11
|
+
email: assaf@labnotes.org
|
|
12
|
+
homepage: http://trac.labnotes.org/cgi-bin/trac.cgi/wiki/Ruby/MicroformatParser
|
|
13
|
+
rubyforge_project: uformatparser
|
|
14
|
+
description: "Parser for extracting microcontent from (X)HTML documents, in any number of
|
|
15
|
+
microformats. Uses a DSL for specifying the parsing rules as a set of selectors
|
|
16
|
+
and extractors. Supports a CSS-like selector and XPath-like extractor syntaxes
|
|
17
|
+
for quick and easy rule writing. Also supports reusable and compound rules."
|
|
18
|
+
autorequire: uformatparser.rb
|
|
19
|
+
default_executable:
|
|
20
|
+
bindir: bin
|
|
21
|
+
has_rdoc: true
|
|
22
|
+
required_ruby_version: !ruby/object:Gem::Version::Requirement
|
|
23
|
+
requirements:
|
|
24
|
+
-
|
|
25
|
+
- ">"
|
|
26
|
+
- !ruby/object:Gem::Version
|
|
27
|
+
version: 0.0.0
|
|
28
|
+
version:
|
|
29
|
+
platform: ruby
|
|
30
|
+
authors:
|
|
31
|
+
- Assaf Arkin
|
|
32
|
+
files:
|
|
33
|
+
- lib/uformatparser.rb
|
|
34
|
+
- README
|
|
35
|
+
- Rakefile
|
|
36
|
+
- MIT-LICENSE
|
|
37
|
+
test_files: []
|
|
38
|
+
rdoc_options:
|
|
39
|
+
- "--main"
|
|
40
|
+
- README
|
|
41
|
+
- "--title"
|
|
42
|
+
- Microformat parser
|
|
43
|
+
- "--line-numbers"
|
|
44
|
+
extra_rdoc_files:
|
|
45
|
+
- README
|
|
46
|
+
executables: []
|
|
47
|
+
extensions: []
|
|
48
|
+
requirements:
|
|
49
|
+
- ReXML. HTML->ReXML parser.
|
|
50
|
+
dependencies: []
|