imw 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (111) hide show
  1. data/.gitignore +15 -0
  2. data/CHANGELOG +0 -0
  3. data/LICENSE +674 -0
  4. data/README.rdoc +101 -0
  5. data/Rakefile +20 -0
  6. data/VERSION +1 -0
  7. data/etc/imwrc.rb +76 -0
  8. data/lib/imw.rb +42 -0
  9. data/lib/imw/boot.rb +58 -0
  10. data/lib/imw/dataset.rb +233 -0
  11. data/lib/imw/dataset/datamapper.rb +66 -0
  12. data/lib/imw/dataset/datamapper/time_and_user_stamps.rb +37 -0
  13. data/lib/imw/dataset/loaddump.rb +50 -0
  14. data/lib/imw/dataset/old/file_collection.rb +88 -0
  15. data/lib/imw/dataset/old/file_collection_utils.rb +71 -0
  16. data/lib/imw/dataset/scaffold.rb +132 -0
  17. data/lib/imw/dataset/scraped_uri.rb +305 -0
  18. data/lib/imw/dataset/scrub/old_working_scrubber.rb +87 -0
  19. data/lib/imw/dataset/scrub/scrub.rb +147 -0
  20. data/lib/imw/dataset/scrub/scrub_simple_url.rb +38 -0
  21. data/lib/imw/dataset/scrub/scrub_test.rb +60 -0
  22. data/lib/imw/dataset/scrub/slug.rb +101 -0
  23. data/lib/imw/dataset/stats.rb +73 -0
  24. data/lib/imw/dataset/stats/counter.rb +23 -0
  25. data/lib/imw/dataset/task.rb +38 -0
  26. data/lib/imw/dataset/workflow.rb +81 -0
  27. data/lib/imw/files.rb +110 -0
  28. data/lib/imw/files/archive.rb +113 -0
  29. data/lib/imw/files/basicfile.rb +122 -0
  30. data/lib/imw/files/binary.rb +28 -0
  31. data/lib/imw/files/compressed_file.rb +93 -0
  32. data/lib/imw/files/compressed_files_and_archives.rb +348 -0
  33. data/lib/imw/files/compressible.rb +103 -0
  34. data/lib/imw/files/csv.rb +112 -0
  35. data/lib/imw/files/json.rb +41 -0
  36. data/lib/imw/files/sgml.rb +65 -0
  37. data/lib/imw/files/text.rb +68 -0
  38. data/lib/imw/files/yaml.rb +46 -0
  39. data/lib/imw/packagers.rb +8 -0
  40. data/lib/imw/packagers/archiver.rb +108 -0
  41. data/lib/imw/packagers/s3_mover.rb +28 -0
  42. data/lib/imw/parsers.rb +7 -0
  43. data/lib/imw/parsers/html_parser.rb +382 -0
  44. data/lib/imw/parsers/html_parser/matchers.rb +306 -0
  45. data/lib/imw/parsers/line_parser.rb +87 -0
  46. data/lib/imw/parsers/regexp_parser.rb +72 -0
  47. data/lib/imw/utils.rb +24 -0
  48. data/lib/imw/utils/components.rb +61 -0
  49. data/lib/imw/utils/config.rb +46 -0
  50. data/lib/imw/utils/error.rb +54 -0
  51. data/lib/imw/utils/extensions/array.rb +125 -0
  52. data/lib/imw/utils/extensions/class/attribute_accessors.rb +8 -0
  53. data/lib/imw/utils/extensions/core.rb +43 -0
  54. data/lib/imw/utils/extensions/dir.rb +24 -0
  55. data/lib/imw/utils/extensions/file_core.rb +64 -0
  56. data/lib/imw/utils/extensions/hash.rb +218 -0
  57. data/lib/imw/utils/extensions/hpricot.rb +48 -0
  58. data/lib/imw/utils/extensions/string.rb +49 -0
  59. data/lib/imw/utils/extensions/struct.rb +42 -0
  60. data/lib/imw/utils/extensions/symbol.rb +28 -0
  61. data/lib/imw/utils/extensions/typed_struct.rb +22 -0
  62. data/lib/imw/utils/extensions/uri.rb +59 -0
  63. data/lib/imw/utils/log.rb +67 -0
  64. data/lib/imw/utils/misc.rb +63 -0
  65. data/lib/imw/utils/paths.rb +115 -0
  66. data/lib/imw/utils/uri.rb +59 -0
  67. data/lib/imw/utils/uuid.rb +33 -0
  68. data/lib/imw/utils/validate.rb +38 -0
  69. data/lib/imw/utils/version.rb +12 -0
  70. data/lib/imw/utils/view.rb +113 -0
  71. data/lib/imw/utils/view/dump_csv.rb +112 -0
  72. data/lib/imw/utils/view/dump_csv_older.rb +117 -0
  73. data/spec/data/sample.csv +131 -0
  74. data/spec/data/sample.tsv +131 -0
  75. data/spec/data/sample.txt +131 -0
  76. data/spec/data/sample.xml +653 -0
  77. data/spec/data/sample.yaml +652 -0
  78. data/spec/imw/dataset/datamapper/uri_spec.rb +43 -0
  79. data/spec/imw/dataset/datamapper_spec_helper.rb +11 -0
  80. data/spec/imw/files/archive_spec.rb +118 -0
  81. data/spec/imw/files/basicfile_spec.rb +121 -0
  82. data/spec/imw/files/bz2_spec.rb +32 -0
  83. data/spec/imw/files/compressed_file_spec.rb +96 -0
  84. data/spec/imw/files/compressible_spec.rb +100 -0
  85. data/spec/imw/files/file_spec.rb +144 -0
  86. data/spec/imw/files/gz_spec.rb +32 -0
  87. data/spec/imw/files/rar_spec.rb +33 -0
  88. data/spec/imw/files/tar_spec.rb +31 -0
  89. data/spec/imw/files/text_spec.rb +23 -0
  90. data/spec/imw/files/zip_spec.rb +31 -0
  91. data/spec/imw/files_spec.rb +38 -0
  92. data/spec/imw/packagers/archiver_spec.rb +125 -0
  93. data/spec/imw/packagers/s3_mover_spec.rb +7 -0
  94. data/spec/imw/parsers/line_parser_spec.rb +96 -0
  95. data/spec/imw/parsers/regexp_parser_spec.rb +42 -0
  96. data/spec/imw/utils/extensions/file_core_spec.rb +72 -0
  97. data/spec/imw/utils/extensions/find_spec.rb +113 -0
  98. data/spec/imw/utils/paths_spec.rb +38 -0
  99. data/spec/imw/workflow/rip/local_spec.rb +89 -0
  100. data/spec/imw/workflow/rip_spec.rb +27 -0
  101. data/spec/rcov.opts +1 -0
  102. data/spec/spec.opts +4 -0
  103. data/spec/spec_helper.rb +32 -0
  104. data/spec/support/archive_contents_matcher.rb +94 -0
  105. data/spec/support/custom_matchers.rb +21 -0
  106. data/spec/support/directory_contents_matcher.rb +61 -0
  107. data/spec/support/extensions.rb +18 -0
  108. data/spec/support/file_contents_matcher.rb +50 -0
  109. data/spec/support/random.rb +210 -0
  110. data/spec/support/without_regard_to_order_matcher.rb +58 -0
  111. metadata +196 -0
@@ -0,0 +1,108 @@
module IMW
  module Packagers

    # Packages an Array of input files into a single output archive.
    # When the archive is extracted, all the input files given will be
    # in a single directory with a chosen name.  The path to the output
    # archive determines both the name of the archive and its type (tar,
    # tar.bz2, zip, &c.).
    #
    # If any of the input files are themselves archives, they will first
    # be extracted, with only their contents winding up in the final
    # directory (the file hierarchy of the archive will be preserved).
    # If any of the input files are compressed, they will first be
    # uncompressed before being added to the directory.
    #
    # Input files can be renamed by passing in a Hash instead of an
    # Array.  Each key in this hash is the path to an input file and its
    # value is the new basename to give it.  If the basename is +nil+
    # then the original path's basename will be used.
    class Archiver

      attr_accessor :name, :inputs

      # +name+ becomes the name of the directory holding the packaged
      # files (see +dir+); +inputs+ is an Array of paths, a Hash of
      # path => basename, or +nil+ for "no inputs yet".
      def initialize name, inputs
        @name = name
        add_inputs inputs
      end

      # Register more input files.  Paths are expanded to absolute paths
      # and used as keys of +inputs+; the value is the basename the file
      # will carry inside the package.  Adding the same path twice
      # overwrites the earlier entry.  A +nil+ argument is ignored.
      def add_inputs new_inputs
        @inputs ||= {}
        return if new_inputs.nil?
        if new_inputs.is_a?(Array)
          new_inputs.each do |input|
            @inputs[File.expand_path(input)] = File.basename(input)
          end
        else
          new_inputs.each_pair do |input, basename|
            @inputs[File.expand_path(input)] = (basename || File.basename(input))
          end
        end
      end

      # Errors recorded so far through +add_processing_error+.
      def errors
        @errors ||= []
      end

      # Log +error+ as a warning and remember it.
      def add_processing_error error
        IMW.logger.warn error
        errors << error
      end

      # True as long as no processing error has been recorded.
      def success?
        errors.empty?
      end

      # A temporary directory to work in.  Its contents will
      # ultimately consist of a directory named for the package
      # containing all the input files.
      #
      # The name is built from the current epoch second and the process
      # id and is memoized per instance.
      # NOTE(review): two Archivers created by the same process within
      # the same second get the same path -- confirm that is acceptable.
      def tmp_dir
        @tmp_dir ||= File.join(IMW.path_to(:tmp_root, 'packager'), (Time.now.to_i.to_s + "-" + $$.to_s))
      end

      # Remove the temporary working directory and everything in it.
      def clean!
        FileUtils.rm_rf(tmp_dir)
      end

      # A directory which will contain all the content being packaged,
      # including the contents of any archives that were included in
      # the list of files to process.
      def dir
        @dir ||= File.join(tmp_dir, name.to_s)
      end

      # Populate +dir+ from +inputs+: archives are extracted into +dir+,
      # compressed files are copied and decompressed, everything else is
      # copied under its chosen basename.
      def prepare!
        FileUtils.mkdir_p dir unless File.exist?(dir)
        inputs.each_pair do |path, basename|
          new_path = File.join(dir, basename)
          # The file class is chosen from the basename because the file's
          # original path is meaningless: RackMultipart20091203-958-1nkgc61-0
          file = IMW.open(path, :as => IMW::Files.file_class_for(basename))
          case
          when file.archive?
            FileUtils.cd(dir) do
              file.extract
            end
          when file.compressed?
            file.cp(new_path).decompress!
          else
            file.cp(new_path)
          end
        end
      end

      # Package the contents of the temporary directory to an archive
      # at +output+ (a path String or an already opened IMW file
      # object).  Any existing file at +output+ is removed first.  The
      # archive is built inside +tmp_dir+ and then moved into place; a
      # processing error is recorded when +output+ does not exist
      # afterwards.  Returns the +output+ file object.
      #
      # +options+ is accepted for interface compatibility but is not
      # used here.
      def package! output, options={}
        output = IMW.open(output) if output.is_a?(String)
        FileUtils.mkdir_p(output.dirname) unless File.exist?(output.dirname)
        output.rm! if output.exist?
        FileUtils.cd(tmp_dir) do
          temp_output = IMW.open(output.basename)
          temp_output.create(name.to_s + '/*').mv(output.path)
          # Clean up the intermediate archive if the move left it behind.
          temp_output.rm if temp_output.exist?
          add_processing_error "Archiver: couldn't create archive #{output.path}" unless output.exist?
        end
        output
      end
    end
  end
end
108
+
@@ -0,0 +1,28 @@
1
+ require 'aws/s3'
module IMW
  module Packagers
    # Uploads local files to one Amazon S3 bucket through the aws-s3
    # gem (AWS::S3) and remembers the response of the last upload.
    class S3Mover

      attr_reader :last_response
      attr_accessor :bucket_name

      # +options+ may carry a <tt>:bucket_name</tt>; every other entry is
      # handed to AWS::S3::Base.establish_connection!.  The caller's
      # hash is copied first so that it is left untouched.
      def initialize options={}
        connection_options = options.dup
        @bucket_name = connection_options.delete(:bucket_name)
        AWS::S3::Base.establish_connection!(connection_options)
      end

      # Truthy only when an upload has been attempted and the response
      # recorded for it is exactly a Net::HTTPOK.
      def success?
        last_response && last_response.response.class == Net::HTTPOK
      end

      # Store the file at +local_path+ under the key +remote_path+ in
      # +bucket_name+.  The result of the store call is kept in
      # +last_response+ and returned.  The block form of File.open makes
      # sure the file handle is closed even if the upload raises.
      def upload! local_path, remote_path
        @last_response = File.open(local_path) do |file|
          AWS::S3::S3Object.store(remote_path, file, bucket_name)
        end
      end

    end
  end
end
@@ -0,0 +1,7 @@
module IMW
  # Namespace for the parsers.  Each constant below is registered for
  # autoloading, so its file is only required on first reference.
  #
  # NOTE(review): 'imw/parsers/html_parser' appears to define
  # IMW::HTMLParser rather than IMW::Parsers::HTML -- confirm that the
  # :HTML entry resolves as intended.
  module Parsers
    {
      :HTML         => 'imw/parsers/html_parser',
      :LineParser   => 'imw/parsers/line_parser',
      :RegexpParser => 'imw/parsers/regexp_parser'
    }.each_pair do |const_name, feature|
      autoload const_name, feature
    end
  end
end
@@ -0,0 +1,382 @@
1
+ #
2
+ # h2. lib/imw/parsers/html_parser.rb -- html parser
3
+ #
4
+ # == About
5
+ #
6
+ # h4. HTML Extractor
7
+ #
8
+ # * map repeating HTML elements to intermediate ruby data structure
9
+ # * optimize all the common cases for expressive brevity
10
+ # * output structure will come from HTML structure; map to desired output objects in transform stage.
11
+ # * spec shouldn't be allowed to get too much more complicated than this; again, transform stage exists
12
+ #
13
+ # If this doesn't yield satisfaction you may enjoy
14
+ # * http://blog.labnotes.org/2006/07/11/scraping-with-style-scrapi-toolkit-for-ruby/
15
+ # * http://scrubyt.org/
16
+ # Note of course that these have quite different goals. For example, we don't
17
+ # have any interest in "interactive" crawling, eg form submission, or at least
18
+ # that goes elsewhere.
19
+ #
20
+ #
21
+ # == Sample HTML (http://twitter.com):
22
+ #
23
+ # <ul class="about vcard entry-author">
24
+ # <li ><span class="label">Name</span> <span class="fn" >MarsPhoenix </span> </li>
25
+ # <li ><span class="label">Location</span> <span class="adr">Mars, Solar System</span> </li>
26
+ # <li id="bio"><span class="label">Bio</span> <span class="bio">I dig Mars! </span> </li>
27
+ # <li ><span class="label">Web</span>
28
+ # <a href="http://tinyurl.com/5wwaru" class="url" rel="me nofollow">http://tinyurl.co...</a></li>
29
+ # </ul>
30
+ #
31
+ # == Parser Spec:
32
+ # :hcard => m_one('//ul.vcard.about',
33
+ # {
34
+ # :name => 'li/span.fn',
35
+ # :location => 'li/span.adr',
36
+ # :url => m_attr('li/a.url[@href]', 'href'),
37
+ # :bio => 'li#bio/span.bio',
38
+ # }
39
+ # )
40
+ #
41
+ # == Example return:
42
+ # { :hcard => { :name => 'Mars Phoenix', :location => 'Mars, Solar System', :bio => 'I dig Mars!', :url => 'http://tinyurl.com/5wwaru' } }
43
+ #
44
+ # == Sample HTML (http://delicious.com):
45
+ # <ul id="bookmarklist" class="bookmarks NOTHUMB">
46
+ # <li class="post" id="item-...">
47
+ # <div class="bookmark NOTHUMB">
48
+ # <div class="dateGroup"> <span title="23 APR 08">23 APR 08</span> </div>
49
+ # <div class="data">
50
+ # <h4> <a rel="nofollow" class="taggedlink" href="http://www.cs.biu.ac.il/~koppel/BlogCorpus.htm">Blog Authorship Corpus (Blogger.com 1994)</a>
51
+ # <a class="inlinesave" href="...">SAVE</a> </h4>
52
+ # <h5 class="savers-label"> PEOPLE</h5>
53
+ # <div class="savers savers2"> <a class="delNav" href="/url/7df6661946fca61863312644eb071953"><span class="delNavCount">26</span></a> </div>
54
+ # <div class="description"> The Blog Authorship Corpus consists of the collected posts of 19,320 bloggers gathered from blogger.com in August 2004. The corpus incorporates a total of 681,288 posts and over 140 million words - or approximately 35 posts and 7250 words per person. </div>
55
+ # </div>
56
+ # <div class="meta"></div>
57
+ # <h5 class="tag-chain-label">TAGS</h5>
58
+ # <div class="tagdisplay">
59
+ # <ul class="tag-chain">
60
+ # <li class="tag-chain-item off first"><a class="tag-chain-item-link" rel="tag" href="/infochimps/blog" ><span class="tag-chain-item-span">blog</span> </a></li>
61
+ # <li class="tag-chain-item off"> <a class="tag-chain-item-link" rel="tag" href="/infochimps/corpus" ><span class="tag-chain-item-span">corpus</span> </a></li>
62
+ # <li class="tag-chain-item off"> <a class="tag-chain-item-link" rel="tag" href="/infochimps/analysis" ><span class="tag-chain-item-span">analysis</span></a></li>
63
+ # <li class="tag-chain-item off"> <a class="tag-chain-item-link" rel="tag" href="/infochimps/nlp" ><span class="tag-chain-item-span">nlp</span> </a></li>
64
+ # <li class="tag-chain-item on last"> <a class="tag-chain-item-link" rel="tag" href="/infochimps/dataset" ><span class="tag-chain-item-span">dataset</span> </a></li>
65
+ # </ul>
66
+ # </div>
67
+ # <div class="clr"></div>
68
+ # </div>
69
+ # </li>
70
+ # </ul>
71
+ #
72
+ # == Parser Specification:
73
+ # :bookmarks => [ 'ul#bookmarklist/li.post/.bookmark',
74
+ # {
75
+ # :date => hash( '.dateGroup/span',
76
+ # [:day, :month, :year] => regexp( '', /(\d{2}) ([A-Z]{3}) (\d{2})/),
77
+ # ),
78
+ # :title => '.data/h4/a.taggedlink',
79
+ # :url => attr( '.data/h4/a.taggedlink', 'href'),
80
+ # :del_link_url => href( '.data/.savers/a.delNav'),
81
+ # :num_savers => to_i( '.data/.savers//span.delNavCount'),
82
+ # :description => '.data/.description',
83
+ # :tags => ['.tagdisplay//span.tag-chain-item-span']
84
+ # }
85
+ # ]
86
+ #
87
+ # == Example output:
88
+ # { :bookmarks => [
89
+ # { :date => { :year => '08', :month => 'APR', :day => '23' },
90
+ # :title => 'Blog Authorship Corpus (Blogger.com 1994)',
91
+ # :url => 'http://www.cs.biu.ac.il/~koppel/BlogCorpus.htm',
92
+ # :del_link_url => '/url/7df6661946fca61863312644eb071953',
93
+ # :num_savers => 26,
94
+ # :description => 'The Blog ... ',
95
+ # :tags => ['blog', 'corpus', 'analysis', 'nlp', 'dataset'],
96
+ # }
97
+ # ]}
98
+ #
99
+ # == Implementation:
100
+ #
101
+ # Internally, we take the spec and turn it into a recursive structure of Matcher
102
+ # objects. These consume Hpricot Elements and return the appropriately extracted
103
+ # object.
104
+ #
105
+ # Note that the /default/ is for a bare selector to match ONE element, and to not
106
+ # complain if there are many.
107
+ #
108
+ # Missing elements are silently ignored -- for example if
109
+ # :foo => 'li.missing'
110
+ # there will simply be no :foo element in the hash (as opposed to having hsh[:foo]
111
+ # set to nil -- hsh.include?(foo) will be false)
112
+ #
113
+ #
114
+ # == List of Matchers:
115
+ # { :field => /spec/, ... } # hash hash, each field taken from spec.
116
+ # [ "hpricot_path" ] # 1-el array array: for each element matching
117
+ # hpricot_path, the inner_html
118
+ # [ "hpricot_path", /spec/ ] # 2-el array array: for each element matching
119
+ # hpricot_path, pass to spec
120
+ # "hpricot_path" # string same as one("hpricot_path")
121
+ # one("hpricot_path") # one first match to hpricot_path
122
+ # one("hpricot_path", /spec/) # one applies spec to first match to hpricot_path
123
+ # (these all match on one path:)
124
+ # regexp("hpricot_path", /RE/) # regexp capture groups from matching RE against
125
+ # inner_html of first match to hpricot_path
126
+ # attr("hpricot_path", 'attr_name') # attr
127
+ # href("hpricot_path") # href shorthand for attr(foo, 'href')
128
+ # no_html # strip tags from contents
129
+ # html_encoded # html encode contents
130
+ # to_i, to_f, etc # convert
131
+ # lambda{|doc| ... } # proc calls proc on current doc
132
+ #
133
+ # == Complicated HCard example:
134
+ # :hcards => [ '//ul.users/li.vcard',
135
+ # {
136
+ # :name => '.fn',
137
+ # :address => one('.adr',
138
+ # :street => '.street',
139
+ # :city => '.city',
140
+ # :zip => '.postal'
141
+ # ),
142
+ # :tel => [ 'span.tel',
143
+ # {
144
+ # :type => 'span.type',
145
+ # [:cc, :area, :num] => hp.regexp('span.value', /\+(\d+).(\d{3})-(\d{3}-\d{4})/),
146
+ # }
147
+ # ],
148
+ # :tags => [ '.tag' ],
149
+ # }
150
+ # ]
151
+ #
152
+ # == Resulting Parser
153
+ # MatchHash({:hcards => MatchArray('//ul.users/li.vcard',
154
+ # MatchHash({
155
+ # :name => MatchFirst('.fn'),
156
+ # :address => MatchFirst('.adr',
157
+ # MatchHash({
158
+ # :street => MatchFirst('.street'),
159
+ # :city => MatchFirst('.locality'),
160
+ # :state => MatchFirst('.region'),
161
+ # :zip => MatchFirst('.postal'),
162
+ # })),
163
+ # :tel => MatchArray('span.tel',
164
+ # MatchHash({
165
+ # :type => MatchFirst('span.type'),
166
+ # [:cc, :area, :num] => RegexpMatcher('span.value', /\+(\d+).(\d{3})-(\d{3}-\d{4})/),
167
+ # })
168
+ # ),
169
+ # :tags => MatchArray('.tag'),
170
+ # })
171
+ # )
172
+ #
173
+ # == Example output
174
+ # [
175
+ # {:tel => [ {:type => 'home', :cc => '49', :area => '305', :num => '555-1212'},
176
+ # {:type => 'work', :cc => '49', :area => '305', :num => '555-6969'}, ],
177
+ # :name => "Bob Dobbs, Jr.",
178
+ # :tags => ["church"] },
179
+ # {:tel => [ {:type => 'fax', :cc => '49', :area => '305', :num => '867-5309'}, ],
180
+ # :name => "Jenny",
181
+ # :address => { :street => "53 Evergreen Terr.", :city => "Springfield" },
182
+ # :tags => ["bathroom", "wall"] },
183
+ # ]
184
+ #
185
+ # Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
186
+ # Copyright:: Copyright (c) 2008 infochimps.org
187
+ # License:: GPL 3.0
188
+ # Website:: http://infinitemonkeywrench.org/
189
+ #
190
+ # puts "#{File.basename(__FILE__)}: Something clever" # at bottom
191
+
192
+ require 'imw/parsers/html_parser/matchers'
193
+
194
+ class IMW::HTMLParser
195
+
196
+ include IMW::HTMLParserMatcher
197
+
198
+ attr_accessor :parse_tree
199
+
200
+ #
201
+ # Parse Tree
202
+ #
203
+ def initialize arg_spec=nil
204
+ spec = arg_spec || self.class.parser_spec
205
+ self.parse_tree = IMW::HTMLParserMatcher.build_parse_tree(spec)
206
+ end
207
+
208
+ #
209
+ # See IMW::HTMLParser for syntax
210
+ #
211
+ #
212
+ def self.parser_spec
213
+ raise "Override this to create your own parser spec"
214
+ end
215
+
216
+ #
217
+ # Walk
218
+ #
219
+ def parse doc
220
+ self.parse_tree.match(doc)
221
+ end
222
+
223
+ # one("hpricot_path") first match to hpricot_path
224
+ # one("hpricot_path", /spec/) applies spec to first match to hpricot_path
225
+ #
226
+ def self.one selector, matcher
227
+ MatchFirstElement.new(selector, IMW::HTMLParserMatcher.build_parse_tree(matcher))
228
+ end
229
+ # match the +attr+ attribute of the first element given by +selector+
230
+ def self.attr selector, attr, matcher=nil
231
+ MatchAttribute.new(selector, attr, IMW::HTMLParserMatcher.build_parse_tree(matcher))
232
+ end
233
+ # shorthand for +attr(foo, 'href')+
234
+ def self.href selector, matcher=nil
235
+ self.attr(selector, 'href', matcher)
236
+ end
237
+ # shorthand for +attr(foo, 'src')+
238
+ def self.src selector, matcher=nil
239
+ self.attr(selector, 'src', matcher)
240
+ end
241
+
242
+ def self.proc selector, proc, matcher=nil
243
+ MatchProc.new(selector, proc, IMW::HTMLParserMatcher.build_parse_tree(matcher))
244
+ end
245
+
246
+ # strip ","s (!! thus disrespecting locale !!!)
247
+ # and convert to int
248
+ def self.to_num selector, matcher=nil
249
+ proc selector, lambda{|num| num.to_s.gsub(/,/,'').to_i if num }, matcher
250
+ end
251
+ def self.to_json selector, matcher=nil
252
+ proc selector, lambda{|v| v.to_json if v }, matcher
253
+ end
254
+
255
+ def self.strip selector, matcher=nil
256
+ proc selector, lambda{|v| v.strip }, matcher
257
+ end
258
+
259
+ def self.re_group selector, re
260
+ MatchRegexp.new(selector, re)
261
+ end
262
+ def self.re selector, re
263
+ MatchRegexp.new(selector, re, nil, :capture => 1)
264
+ end
265
+ def self.re_all selector, re, matcher=nil
266
+ MatchRegexpRepeatedly.new(selector, re)
267
+ end
268
+
269
+ # def self.plain_text selector, matcher=nil
270
+ # proc selector, lambda{|el| el.inner_text if el }, matcher
271
+ # end
272
+
273
+ # attr_accessor :mapping
274
+ #
275
+ # #
276
+ # # Feed me a hash and I'll semantify HTML
277
+ # #
278
+ # # The hash should magically adhere to the too-complicated,
279
+ # # ever evolving goatrope that works for the below
280
+ # #
281
+ # #
282
+ # def initialize mapping
283
+ # self.mapping = mapping
284
+ # end
285
+ #
286
+ # #
287
+ # # take a document subtree,
288
+ # # and a mapping of hpricot paths to that subtree's data mapping
289
+ # # recursively extract that datamapping
290
+ # #
291
+ # def extract_tree hdoc, content, sub_mapping
292
+ # data = { }
293
+ # sub_mapping.each do |selector, target|
294
+ # data[selector] = []
295
+ # sub_contents = content/selector
296
+ # sub_contents.each do |sub_content|
297
+ # sub_data = {}
298
+ # extract_node hdoc, sub_content, sub_data, selector, target
299
+ # data[selector] << sub_data
300
+ # end
301
+ # end
302
+ # data
303
+ # # end
304
+ # # if selector.is_a?(String)
305
+ # # conts = (content)
306
+ # # else
307
+ # # conts = [content]
308
+ # # end
309
+ # # conts[0..0].each do |content|
310
+ # # extract_node hdoc, content, data, selector, target
311
+ # # end
312
+ # # end
313
+ # data
314
+ # end
315
+ #
316
+ # #
317
+ # # insert the extracted element into the data mapping
318
+ # #
319
+ # def extract_node hdoc, content, data, selector, target
320
+ # classification = classify_node(selector, target)
321
+ # result = \
322
+ # case classification
323
+ # when :subtree
324
+ # target.each do |sub_selector, sub_target|
325
+ # extract_node hdoc, content, data, sub_selector, sub_target
326
+ # end
327
+ #
328
+ # when :sub_attribute
329
+ # k, v = selector.to_a[0]
330
+ # subcontent = (k[0..0] == '/') ? (hdoc.at(k)) : (content.at(k))
331
+ # val = subcontent.attributes[v.to_s] if subcontent
332
+ # data[target] = val unless val.blank?
333
+ #
334
+ # when :attribute then
335
+ # val = content.attributes[selector.to_s]
336
+ # data[target] = val unless val.blank?
337
+ #
338
+ # when :flatten_list
339
+ # subcontents = (selector[0..0] == '/') ? (hdoc/selector) : (content/selector)
340
+ # data[target.first] = subcontents.map{|subcontent| subcontent.inner_html }
341
+ #
342
+ # when :inner_html
343
+ # subcontent = (selector[0..0] == '/') ? (hdoc.at(selector)) : (content.at(selector))
344
+ # data[target] = subcontent.inner_html.strip if subcontent
345
+ #
346
+ # else
347
+ # raise "classify_node shouldn't ever return #{classification}"
348
+ # end
349
+ # # puts "%-19s %-19s %-31s %s" % [target.inspect[0..18], classification.inspect[0..18], selector.inspect[0..30], result.inspect[0..90]] if (classification == :sub_attribute)
350
+ # # puts '' if classification == :subtree
351
+ # end
352
+ #
353
+ # def classify_node selector, target
354
+ # case
355
+ # when target.is_a?(Hash) then :subtree
356
+ # when selector.is_a?(Hash) && (selector.length == 1) then
357
+ # k, v = selector.to_a[0]
358
+ # case v
359
+ # when Symbol then :sub_attribute
360
+ # end
361
+ # when selector.is_a?(Symbol) then :attribute
362
+ # when selector.is_a?(String) && target.is_a?(Array) then :flatten_list
363
+ # when selector.is_a?(String) && target.is_a?(Symbol) then :inner_html
364
+ # else
365
+ # raise "Can't classify mapping: " + [selector, target].join(" - ")
366
+ # end
367
+ # end
368
+ #
369
+ # # use #mapping to parse file
370
+ # def parse link
371
+ # begin hdoc = Hpricot(link.contents)
372
+ # rescue; warn "can't hpricot #{link.to_s}" ; return false; end
373
+ # raw_taggings = extract_tree hdoc, hdoc, self.mapping
374
+ # end
375
+ #
376
+ # # use #mapping to parse file
377
+ # def parse_file filename
378
+ # begin hdoc = Hpricot(File.open(filename))
379
+ # rescue; warn "can't hpricot #{filename}" ; return false; end
380
+ # raw_taggings = extract_tree hdoc, hdoc, self.mapping
381
+ # end
382
+ end