imw 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (111) hide show
  1. data/.gitignore +15 -0
  2. data/CHANGELOG +0 -0
  3. data/LICENSE +674 -0
  4. data/README.rdoc +101 -0
  5. data/Rakefile +20 -0
  6. data/VERSION +1 -0
  7. data/etc/imwrc.rb +76 -0
  8. data/lib/imw.rb +42 -0
  9. data/lib/imw/boot.rb +58 -0
  10. data/lib/imw/dataset.rb +233 -0
  11. data/lib/imw/dataset/datamapper.rb +66 -0
  12. data/lib/imw/dataset/datamapper/time_and_user_stamps.rb +37 -0
  13. data/lib/imw/dataset/loaddump.rb +50 -0
  14. data/lib/imw/dataset/old/file_collection.rb +88 -0
  15. data/lib/imw/dataset/old/file_collection_utils.rb +71 -0
  16. data/lib/imw/dataset/scaffold.rb +132 -0
  17. data/lib/imw/dataset/scraped_uri.rb +305 -0
  18. data/lib/imw/dataset/scrub/old_working_scrubber.rb +87 -0
  19. data/lib/imw/dataset/scrub/scrub.rb +147 -0
  20. data/lib/imw/dataset/scrub/scrub_simple_url.rb +38 -0
  21. data/lib/imw/dataset/scrub/scrub_test.rb +60 -0
  22. data/lib/imw/dataset/scrub/slug.rb +101 -0
  23. data/lib/imw/dataset/stats.rb +73 -0
  24. data/lib/imw/dataset/stats/counter.rb +23 -0
  25. data/lib/imw/dataset/task.rb +38 -0
  26. data/lib/imw/dataset/workflow.rb +81 -0
  27. data/lib/imw/files.rb +110 -0
  28. data/lib/imw/files/archive.rb +113 -0
  29. data/lib/imw/files/basicfile.rb +122 -0
  30. data/lib/imw/files/binary.rb +28 -0
  31. data/lib/imw/files/compressed_file.rb +93 -0
  32. data/lib/imw/files/compressed_files_and_archives.rb +348 -0
  33. data/lib/imw/files/compressible.rb +103 -0
  34. data/lib/imw/files/csv.rb +112 -0
  35. data/lib/imw/files/json.rb +41 -0
  36. data/lib/imw/files/sgml.rb +65 -0
  37. data/lib/imw/files/text.rb +68 -0
  38. data/lib/imw/files/yaml.rb +46 -0
  39. data/lib/imw/packagers.rb +8 -0
  40. data/lib/imw/packagers/archiver.rb +108 -0
  41. data/lib/imw/packagers/s3_mover.rb +28 -0
  42. data/lib/imw/parsers.rb +7 -0
  43. data/lib/imw/parsers/html_parser.rb +382 -0
  44. data/lib/imw/parsers/html_parser/matchers.rb +306 -0
  45. data/lib/imw/parsers/line_parser.rb +87 -0
  46. data/lib/imw/parsers/regexp_parser.rb +72 -0
  47. data/lib/imw/utils.rb +24 -0
  48. data/lib/imw/utils/components.rb +61 -0
  49. data/lib/imw/utils/config.rb +46 -0
  50. data/lib/imw/utils/error.rb +54 -0
  51. data/lib/imw/utils/extensions/array.rb +125 -0
  52. data/lib/imw/utils/extensions/class/attribute_accessors.rb +8 -0
  53. data/lib/imw/utils/extensions/core.rb +43 -0
  54. data/lib/imw/utils/extensions/dir.rb +24 -0
  55. data/lib/imw/utils/extensions/file_core.rb +64 -0
  56. data/lib/imw/utils/extensions/hash.rb +218 -0
  57. data/lib/imw/utils/extensions/hpricot.rb +48 -0
  58. data/lib/imw/utils/extensions/string.rb +49 -0
  59. data/lib/imw/utils/extensions/struct.rb +42 -0
  60. data/lib/imw/utils/extensions/symbol.rb +28 -0
  61. data/lib/imw/utils/extensions/typed_struct.rb +22 -0
  62. data/lib/imw/utils/extensions/uri.rb +59 -0
  63. data/lib/imw/utils/log.rb +67 -0
  64. data/lib/imw/utils/misc.rb +63 -0
  65. data/lib/imw/utils/paths.rb +115 -0
  66. data/lib/imw/utils/uri.rb +59 -0
  67. data/lib/imw/utils/uuid.rb +33 -0
  68. data/lib/imw/utils/validate.rb +38 -0
  69. data/lib/imw/utils/version.rb +12 -0
  70. data/lib/imw/utils/view.rb +113 -0
  71. data/lib/imw/utils/view/dump_csv.rb +112 -0
  72. data/lib/imw/utils/view/dump_csv_older.rb +117 -0
  73. data/spec/data/sample.csv +131 -0
  74. data/spec/data/sample.tsv +131 -0
  75. data/spec/data/sample.txt +131 -0
  76. data/spec/data/sample.xml +653 -0
  77. data/spec/data/sample.yaml +652 -0
  78. data/spec/imw/dataset/datamapper/uri_spec.rb +43 -0
  79. data/spec/imw/dataset/datamapper_spec_helper.rb +11 -0
  80. data/spec/imw/files/archive_spec.rb +118 -0
  81. data/spec/imw/files/basicfile_spec.rb +121 -0
  82. data/spec/imw/files/bz2_spec.rb +32 -0
  83. data/spec/imw/files/compressed_file_spec.rb +96 -0
  84. data/spec/imw/files/compressible_spec.rb +100 -0
  85. data/spec/imw/files/file_spec.rb +144 -0
  86. data/spec/imw/files/gz_spec.rb +32 -0
  87. data/spec/imw/files/rar_spec.rb +33 -0
  88. data/spec/imw/files/tar_spec.rb +31 -0
  89. data/spec/imw/files/text_spec.rb +23 -0
  90. data/spec/imw/files/zip_spec.rb +31 -0
  91. data/spec/imw/files_spec.rb +38 -0
  92. data/spec/imw/packagers/archiver_spec.rb +125 -0
  93. data/spec/imw/packagers/s3_mover_spec.rb +7 -0
  94. data/spec/imw/parsers/line_parser_spec.rb +96 -0
  95. data/spec/imw/parsers/regexp_parser_spec.rb +42 -0
  96. data/spec/imw/utils/extensions/file_core_spec.rb +72 -0
  97. data/spec/imw/utils/extensions/find_spec.rb +113 -0
  98. data/spec/imw/utils/paths_spec.rb +38 -0
  99. data/spec/imw/workflow/rip/local_spec.rb +89 -0
  100. data/spec/imw/workflow/rip_spec.rb +27 -0
  101. data/spec/rcov.opts +1 -0
  102. data/spec/spec.opts +4 -0
  103. data/spec/spec_helper.rb +32 -0
  104. data/spec/support/archive_contents_matcher.rb +94 -0
  105. data/spec/support/custom_matchers.rb +21 -0
  106. data/spec/support/directory_contents_matcher.rb +61 -0
  107. data/spec/support/extensions.rb +18 -0
  108. data/spec/support/file_contents_matcher.rb +50 -0
  109. data/spec/support/random.rb +210 -0
  110. data/spec/support/without_regard_to_order_matcher.rb +58 -0
  111. metadata +196 -0
@@ -0,0 +1,108 @@
1
require 'fileutils'

module IMW
  module Packagers

    # Packages an Array of input files into a single output archive.
    # When the archive is extracted, all the input files given will be
    # in a single directory with a chosen name.  The path to the output
    # archive determines both the name of the archive and its type (tar,
    # tar.bz2, zip, &c.).
    #
    # If any of the input files are themselves archives, they will first
    # be extracted, with only their contents winding up in the final
    # directory (the file hierarchy of the archive will be preserved).
    # If any of the input files are compressed, they will first be
    # uncompressed before being added to the directory.
    #
    # Input files can be renamed by passing in a Hash instead of an
    # Array.  Each key in this hash is the path to an input file and its
    # value is the new basename to give it.  If the basename is +nil+
    # then the original path's basename will be used.
    class Archiver

      attr_accessor :name, :inputs

      # +name+ is used as the name of the directory that holds the
      # packaged content (see +dir+); +inputs+ is anything accepted by
      # +add_inputs+.
      def initialize name, inputs
        @name = name
        add_inputs inputs
      end

      # Registers more input files.
      #
      # +new_inputs+ may be
      # * a Hash (or anything responding to +each_pair+) mapping an input
      #   path to the basename it should get (+nil+ keeps the path's own
      #   basename),
      # * an Array of paths,
      # * a single path, or
      # * +nil+, in which case nothing is added.
      #
      # Paths are expanded before being stored, so the same file given
      # twice is only registered once.  Returns +new_inputs+.
      def add_inputs new_inputs
        @inputs ||= {}
        if new_inputs.respond_to?(:each_pair)
          new_inputs.each_pair do |input, basename|
            @inputs[File.expand_path(input)] = (basename || File.basename(input))
          end
        else
          # Wrapping + flatten accepts both a lone path and an Array of
          # paths; compact drops nil so "no inputs yet" is not an error.
          [new_inputs].flatten.compact.each do |input|
            @inputs[File.expand_path(input)] = File.basename(input)
          end
        end
        new_inputs
      end

      # Errors recorded so far by +add_processing_error+.
      def errors
        @errors ||= []
      end

      # Logs +error+ as a warning through IMW.logger and records it.
      def add_processing_error error
        IMW.logger.warn error
        errors << error
      end

      # True when no processing error has been recorded.
      def success?
        errors.empty?
      end

      # A temporary directory to work in.  Its contents will
      # ultimately consist of a directory named for the package
      # containing all the input files.
      #
      # The name is built from the current epoch second and the process
      # id, so it is distinct across processes and seconds; two
      # Archivers created by the same process within the same second
      # compute the same path.
      def tmp_dir
        @tmp_dir ||= File.join(IMW.path_to(:tmp_root, 'packager'), (Time.now.to_i.to_s + "-" + $$.to_s))
      end

      # Removes the temporary directory and everything in it.
      def clean!
        FileUtils.rm_rf(tmp_dir)
      end

      # A directory which will contain all the content being packaged,
      # including the contents of any archives that were included in
      # the list of files to process.
      def dir
        @dir ||= File.join(tmp_dir, name.to_s)
      end

      # Fills +dir+ with the registered inputs: archives are extracted
      # into it, compressed files are copied and decompressed, and all
      # other files are copied under their chosen basename.
      def prepare!
        FileUtils.mkdir_p dir unless File.exist?(dir)
        inputs.each_pair do |path, basename|
          new_path = File.join(dir, basename)
          # The file class is chosen from the basename because the file's
          # original path can be meaningless (e.g. an upload temp file
          # such as RackMultipart20091203-958-1nkgc61-0).
          file = IMW.open(path, :as => IMW::Files.file_class_for(basename))
          if file.archive?
            FileUtils.cd(dir) { file.extract }
          elsif file.compressed?
            file.cp(new_path).decompress!
          else
            file.cp(new_path)
          end
        end
      end

      # Package the contents of the temporary directory to an archive
      # at +output+.
      #
      # +output+ is either a path String (opened with IMW.open) or an
      # already opened IMW file object.  Any existing file at +output+
      # is removed first.  +options+ is accepted but currently unused.
      # A processing error is recorded when the archive does not exist
      # afterwards.  Returns the output file object.
      def package! output, options={}
        output = IMW.open(output) if output.is_a?(String)
        FileUtils.mkdir_p(output.dirname) unless File.exist?(output.dirname)
        output.rm! if output.exist?
        FileUtils.cd(tmp_dir) do
          # Build the archive next to the package directory so the paths
          # stored in it are relative to +name+, then move it into place.
          temp_output = IMW.open(output.basename)
          temp_output.create(name.to_s + '/*').mv(output.path)
          temp_output.rm if temp_output.exist?
          add_processing_error "Archiver: couldn't create archive #{output.path}" unless output.exist?
        end
        output
      end
    end
  end
end
108
+
@@ -0,0 +1,28 @@
1
+ require 'aws/s3'
2
module IMW
  module Packagers

    # Uploads local files to a bucket on Amazon S3 through the AWS::S3
    # library.
    class S3Mover

      # The value returned by the most recent call to +upload!+.
      attr_reader :last_response

      # Name of the bucket uploads are stored in.
      attr_accessor :bucket_name

      # +options+ may contain <tt>:bucket_name</tt>; every other entry is
      # handed to AWS::S3::Base.establish_connection!.
      def initialize options={}
        # Work on a copy so the caller's hash keeps its :bucket_name.
        options      = options.dup
        @bucket_name = options.delete(:bucket_name)
        AWS::S3::Base.establish_connection!(options)
      end

      # True when the last upload was answered with an HTTP 200 (OK)
      # response; false when it was not or nothing has been uploaded yet.
      def success?
        !!(last_response && last_response.response.is_a?(Net::HTTPOK))
      end

      # Stores the file at +local_path+ under the key +remote_path+ in
      # +bucket_name+ and remembers the response in +last_response+.
      def upload! local_path, remote_path
        # Block form closes the handle once the store call returns; binary
        # mode keeps the bytes untouched on every platform.
        @last_response = File.open(local_path, 'rb') do |file|
          AWS::S3::S3Object.store(remote_path, file, bucket_name)
        end
      end

    end
  end
end
@@ -0,0 +1,7 @@
1
module IMW
  # Namespace for IMW's parsers.  Each constant below is registered with
  # +autoload+, so its file is only required the first time the constant
  # is referenced.
  #
  # NOTE(review): imw/parsers/html_parser.rb appears to define
  # IMW::HTMLParser rather than IMW::Parsers::HTML -- confirm that the
  # :HTML entry resolves.
  module Parsers
    [
      [:HTML,         'imw/parsers/html_parser'],
      [:LineParser,   'imw/parsers/line_parser'],
      [:RegexpParser, 'imw/parsers/regexp_parser']
    ].each do |const_name, feature|
      autoload const_name, feature
    end
  end
end
@@ -0,0 +1,382 @@
1
+ #
2
+ # h2. lib/imw/parsers/html_parser.rb -- html parser
3
+ #
4
+ # == About
5
+ #
6
+ # h4. HTML Extractor
7
+ #
8
+ # * map repeating HTML elements to intermediate ruby data structure
9
+ # * optimize all the common cases for expressive brevity
10
+ # * output structure will come from HTML structure; map to desired output objects in transform stage.
11
+ # * spec shouldn't be allowed to get too much more complicated than this; again, transform stage exists
12
+ #
13
+ # If this doesn't yield satisfaction you may enjoy
14
+ # * http://blog.labnotes.org/2006/07/11/scraping-with-style-scrapi-toolkit-for-ruby/
15
+ # * http://scrubyt.org/
16
+ # Note of course that these have quite different goals. For example, we don't
17
+ # have any interest in "interactive" crawling, eg form submission, or at least
18
+ # that goes elsewhere.
19
+ #
20
+ #
21
+ # == Sample HTML (http://twitter.com):
22
+ #
23
+ # <ul class="about vcard entry-author">
24
+ # <li ><span class="label">Name</span> <span class="fn" >MarsPhoenix </span> </li>
25
+ # <li ><span class="label">Location</span> <span class="adr">Mars, Solar System</span> </li>
26
+ # <li id="bio"><span class="label">Bio</span> <span class="bio">I dig Mars! </span> </li>
27
+ # <li ><span class="label">Web</span>
28
+ # <a href="http://tinyurl.com/5wwaru" class="url" rel="me nofollow">http://tinyurl.co...</a></li>
29
+ # </ul>
30
+ #
31
+ # == Parser Spec:
32
+ # :hcard => m_one('//ul.vcard.about',
33
+ # {
34
+ # :name => 'li/span.fn',
35
+ # :location => 'li/span.adr',
36
+ # :url => m_attr('li/a.url[@href]', 'href'),
37
+ # :bio => 'li#bio/span.bio',
38
+ # }
39
+ # )
40
+ #
41
+ # == Example return:
42
+ # { :hcard => { :name => 'MarsPhoenix', :location => 'Mars, Solar System', :bio => 'I dig Mars!', :url => 'http://tinyurl.com/5wwaru' } }
43
+ #
44
+ # == Sample HTML (http://delicious.com):
45
+ # <ul id="bookmarklist" class="bookmarks NOTHUMB">
46
+ # <li class="post" id="item-...">
47
+ # <div class="bookmark NOTHUMB">
48
+ # <div class="dateGroup"> <span title="23 APR 08">23 APR 08</span> </div>
49
+ # <div class="data">
50
+ # <h4> <a rel="nofollow" class="taggedlink" href="http://www.cs.biu.ac.il/~koppel/BlogCorpus.htm">Blog Authorship Corpus (Blogger.com 1994)</a>
51
+ # <a class="inlinesave" href="...">SAVE</a> </h4>
52
+ # <h5 class="savers-label"> PEOPLE</h5>
53
+ # <div class="savers savers2"> <a class="delNav" href="/url/7df6661946fca61863312644eb071953"><span class="delNavCount">26</span></a> </div>
54
+ # <div class="description"> The Blog Authorship Corpus consists of the collected posts of 19,320 bloggers gathered from blogger.com in August 2004. The corpus incorporates a total of 681,288 posts and over 140 million words - or approximately 35 posts and 7250 words per person. </div>
55
+ # </div>
56
+ # <div class="meta"></div>
57
+ # <h5 class="tag-chain-label">TAGS</h5>
58
+ # <div class="tagdisplay">
59
+ # <ul class="tag-chain">
60
+ # <li class="tag-chain-item off first"><a class="tag-chain-item-link" rel="tag" href="/infochimps/blog" ><span class="tag-chain-item-span">blog</span> </a></li>
61
+ # <li class="tag-chain-item off"> <a class="tag-chain-item-link" rel="tag" href="/infochimps/corpus" ><span class="tag-chain-item-span">corpus</span> </a></li>
62
+ # <li class="tag-chain-item off"> <a class="tag-chain-item-link" rel="tag" href="/infochimps/analysis" ><span class="tag-chain-item-span">analysis</span></a></li>
63
+ # <li class="tag-chain-item off"> <a class="tag-chain-item-link" rel="tag" href="/infochimps/nlp" ><span class="tag-chain-item-span">nlp</span> </a></li>
64
+ # <li class="tag-chain-item on last"> <a class="tag-chain-item-link" rel="tag" href="/infochimps/dataset" ><span class="tag-chain-item-span">dataset</span> </a></li>
65
+ # </ul>
66
+ # </div>
67
+ # <div class="clr"></div>
68
+ # </div>
69
+ # </li>
70
+ # </ul>
71
+ #
72
+ # == Parser Specification:
73
+ # :bookmarks => [ 'ul#bookmarklist/li.post/.bookmark',
74
+ # {
75
+ # :date => hash( '.dateGroup/span',
76
+ # [:day, :month, :year] => regexp( '', /(\d{2}) ([A-Z]{3}) (\d{2})/),
77
+ # ),
78
+ # :title => '.data/h4/a.taggedlink',
79
+ # :url => attr( '.data/h4/a.taggedlink', 'href'),
80
+ # :del_link_url => href( '.data/.savers/a.delNav'),
81
+ # :num_savers => to_i( '.data/.savers//span.delNavCount'),
82
+ # :description => '.data/.description',
83
+ # :tags => ['.tagdisplay//span.tag-chain-item-span']
84
+ # }
85
+ # ]
86
+ #
87
+ # == Example output:
88
+ # { :bookmarks => [
89
+ # { :date => { :year => '08', :month => 'APR', :day => '23' },
90
+ # :title => 'Blog Authorship Corpus (Blogger.com 1994)',
91
+ # :url => 'http://www.cs.biu.ac.il/~koppel/BlogCorpus.htm',
92
+ # :del_link_url => '/url/7df6661946fca61863312644eb071953',
93
+ # :num_savers => 26,
94
+ # :description => 'The Blog ... ',
95
+ # :tags => ['blog', 'corpus', 'analysis', 'nlp', 'dataset'],
96
+ # }
97
+ # ]}
98
+ #
99
+ # == Implementation:
100
+ #
101
+ # Internally, we take the spec and turn it into a recursive structure of Matcher
102
+ # objects. These consume Hpricot Elements and return the appropriately extracted
103
+ # object.
104
+ #
105
+ # Note that the /default/ is for a bare selector to match ONE element, and to not
106
+ # complain if there are many.
107
+ #
108
+ # Missing elements are silently ignored -- for example if
109
+ # :foo => 'li.missing'
110
+ # there will simply be no :foo element in the hash (as opposed to having hsh[:foo]
111
+ # set to nil -- hsh.include?(foo) will be false)
112
+ #
113
+ #
114
+ # == List of Matchers:
115
+ # { :field => /spec/, ... } # hash hash, each field taken from spec.
116
+ # [ "hpricot_path" ] # 1-el array array: for each element matching
117
+ # hpricot_path, the inner_html
118
+ # [ "hpricot_path", /spec/ ] # 2-el array array: for each element matching
119
+ # hpricot_path, pass to spec
120
+ # "hpricot_path" # string same as one("hpricot_path")
121
+ # one("hpricot_path") # one first match to hpricot_path
122
+ # one("hpricot_path", /spec/) # one applies spec to first match to hpricot_path
123
+ # (these all match on one path:)
124
+ # regexp("hpricot_path", /RE/) # regexp capture groups from matching RE against
125
+ # inner_html of first match to hpricot_path
126
+ # attr("hpricot_path", 'attr_name') # attr
127
+ # href("hpricot_path") # href shorthand for attr(foo, 'href')
128
+ # no_html # strip tags from contents
129
+ # html_encoded # html encode contents
130
+ # to_i, to_f, etc # convert
131
+ # lambda{|doc| ... } # proc calls proc on current doc
132
+ #
133
+ # == Complicated HCard example:
134
+ # :hcards => [ '//ul.users/li.vcard',
135
+ # {
136
+ # :name => '.fn',
137
+ # :address => one('.adr',
138
+ # :street => '.street',
139
+ # :city => '.city',
140
+ # :zip => '.postal'
141
+ # ),
142
+ # :tel => [ 'span.tel',
143
+ # {
144
+ # :type => 'span.type',
145
+ # [:cc, :area, :num] => regexp('span.value', /\+(\d+).(\d{3})-(\d{3}-\d{4})/),
146
+ # }
147
+ # ],
148
+ # :tags => [ '.tag' ],
149
+ # }
150
+ # ]
151
+ #
152
+ # == Resulting Parser
153
+ # MatchHash({:hcards => MatchArray('//ul.users/li.vcard',
154
+ # MatchHash({
155
+ # :name => MatchFirst('.fn'),
156
+ # :address => MatchFirst('.adr',
157
+ # MatchHash({
158
+ # :street => MatchFirst('.street'),
159
+ # :city => MatchFirst('.locality'),
160
+ # :state => MatchFirst('.region'),
161
+ # :zip => MatchFirst('.postal'),
162
+ # })),
163
+ # :tel => MatchArray('span.tel',
164
+ # MatchHash({
165
+ # :type => MatchFirst('span.type'),
166
+ # [:cc, :area, :num] => RegexpMatcher('span.value', /\+(\d+).(\d{3})-(\d{3}-\d{4})/),
167
+ # })
168
+ # ),
169
+ # :tags => MatchArray('.tag'),
170
+ # })
171
+ # )
172
+ #
173
+ # == Example output
174
+ # [
175
+ # {:tel => [ {:type => 'home', :cc => '49', :area => '305', :num => '555-1212'},
176
+ # {:type => 'work', :cc => '49', :area => '305', :num => '555-6969'}, ],
177
+ # :name => "Bob Dobbs, Jr.",
178
+ # :tags => ["church"] },
179
+ # {:tel => [ {:type => 'fax', :cc => '49', :area => '305', :num => '867-5309'}, ],
180
+ # :name => "Jenny",
181
+ # :address => { :street => "53 Evergreen Terr.", :city => "Springfield" },
182
+ # :tags => ["bathroom", "wall"] },
183
+ # ]
184
+ #
185
+ # Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
186
+ # Copyright:: Copyright (c) 2008 infochimps.org
187
+ # License:: GPL 3.0
188
+ # Website:: http://infinitemonkeywrench.org/
189
+ #
190
+ # puts "#{File.basename(__FILE__)}: Something clever" # at bottom
191
+
192
+ require 'imw/parsers/html_parser/matchers'
193
+
194
# Declarative HTML extractor.  A parser spec is turned into a tree of
# matcher objects by IMW::HTMLParserMatcher.build_parse_tree, and +parse+
# runs a document through that tree.
#
# Either hand a spec to +new+ or subclass and override +parser_spec+.  The
# class-level helpers (+one+, +attr+, +href+, ...) build the individual
# matchers used inside a spec.
class IMW::HTMLParser

  include IMW::HTMLParserMatcher

  # The matcher tree built from the spec.
  attr_accessor :parse_tree

  #
  # Parse Tree
  #
  # Builds the parse tree from +arg_spec+, falling back to the class's
  # +parser_spec+ when no spec is given.
  def initialize arg_spec=nil
    spec = arg_spec || self.class.parser_spec
    self.parse_tree = IMW::HTMLParserMatcher.build_parse_tree(spec)
  end

  #
  # See IMW::HTMLParser for syntax
  #
  # Raises unless overridden; subclasses return their spec here.
  def self.parser_spec
    raise "Override this to create your own parser spec"
  end

  #
  # Walk
  #
  # Applies the parse tree to +doc+ and returns whatever it extracts.
  def parse doc
    self.parse_tree.match(doc)
  end

  # one("hpricot_path")          first match to hpricot_path
  # one("hpricot_path", /spec/)  applies spec to first match to hpricot_path
  #
  # +matcher+ is optional so that the one-argument form shown above works,
  # as it does for the other helpers in this class.
  def self.one selector, matcher=nil
    MatchFirstElement.new(selector, IMW::HTMLParserMatcher.build_parse_tree(matcher))
  end

  # match the +attr+ attribute of the first element given by +selector+
  def self.attr selector, attr, matcher=nil
    MatchAttribute.new(selector, attr, IMW::HTMLParserMatcher.build_parse_tree(matcher))
  end

  # shorthand for +attr(foo, 'href')+
  def self.href selector, matcher=nil
    self.attr(selector, 'href', matcher)
  end

  # shorthand for +attr(foo, 'src')+
  def self.src selector, matcher=nil
    self.attr(selector, 'src', matcher)
  end

  # Builds a MatchProc from +selector+ and the callable +proc+.
  # NOTE: inside this class a bare +proc+ call resolves to this method,
  # not to Kernel#proc.
  def self.proc selector, proc, matcher=nil
    MatchProc.new(selector, proc, IMW::HTMLParserMatcher.build_parse_tree(matcher))
  end

  # strip ","s (!! thus disrespecting locale !!!)
  # and convert to int
  def self.to_num selector, matcher=nil
    proc selector, lambda{|num| num.to_s.gsub(/,/,'').to_i if num }, matcher
  end

  # convert the value to JSON via +to_json+; nil is passed through
  def self.to_json selector, matcher=nil
    proc selector, lambda{|v| v.to_json if v }, matcher
  end

  # strip the value via +strip+; nil is passed through, like +to_num+ and
  # +to_json+ do
  def self.strip selector, matcher=nil
    proc selector, lambda{|v| v.strip if v }, matcher
  end

  # Builds a MatchRegexp for +re+ on +selector+ with default options.
  def self.re_group selector, re
    MatchRegexp.new(selector, re)
  end

  # Builds a MatchRegexp for +re+ on +selector+ with <tt>:capture => 1</tt>.
  def self.re selector, re
    MatchRegexp.new(selector, re, nil, :capture => 1)
  end

  # Builds a MatchRegexpRepeatedly for +re+ on +selector+.
  # NOTE(review): +matcher+ is accepted but not passed on -- confirm
  # whether MatchRegexpRepeatedly should receive it.
  def self.re_all selector, re, matcher=nil
    MatchRegexpRepeatedly.new(selector, re)
  end

  # --- Earlier implementation, kept commented out for reference ---

  # def self.plain_text selector, matcher=nil
  # proc selector, lambda{|el| el.inner_text if el }, matcher
  # end

  # attr_accessor :mapping
  #
  # #
  # # Feed me a hash and I'll semantify HTML
  # #
  # # The hash should magically adhere to the too-complicated,
  # # ever evolving goatrope that works for the below
  # #
  # #
  # def initialize mapping
  # self.mapping = mapping
  # end
  #
  # #
  # # take a document subtree,
  # # and a mapping of hpricot paths to that subtree's data mapping
  # # recursively extract that datamapping
  # #
  # def extract_tree hdoc, content, sub_mapping
  # data = { }
  # sub_mapping.each do |selector, target|
  # data[selector] = []
  # sub_contents = content/selector
  # sub_contents.each do |sub_content|
  # sub_data = {}
  # extract_node hdoc, sub_content, sub_data, selector, target
  # data[selector] << sub_data
  # end
  # end
  # data
  # # end
  # # if selector.is_a?(String)
  # # conts = (content)
  # # else
  # # conts = [content]
  # # end
  # # conts[0..0].each do |content|
  # # extract_node hdoc, content, data, selector, target
  # # end
  # # end
  # data
  # end
  #
  # #
  # # insert the extracted element into the data mapping
  # #
  # def extract_node hdoc, content, data, selector, target
  # classification = classify_node(selector, target)
  # result = \
  # case classification
  # when :subtree
  # target.each do |sub_selector, sub_target|
  # extract_node hdoc, content, data, sub_selector, sub_target
  # end
  #
  # when :sub_attribute
  # k, v = selector.to_a[0]
  # subcontent = (k[0..0] == '/') ? (hdoc.at(k)) : (content.at(k))
  # val = subcontent.attributes[v.to_s] if subcontent
  # data[target] = val unless val.blank?
  #
  # when :attribute then
  # val = content.attributes[selector.to_s]
  # data[target] = val unless val.blank?
  #
  # when :flatten_list
  # subcontents = (selector[0..0] == '/') ? (hdoc/selector) : (content/selector)
  # data[target.first] = subcontents.map{|subcontent| subcontent.inner_html }
  #
  # when :inner_html
  # subcontent = (selector[0..0] == '/') ? (hdoc.at(selector)) : (content.at(selector))
  # data[target] = subcontent.inner_html.strip if subcontent
  #
  # else
  # raise "classify_node shouldn't ever return #{classification}"
  # end
  # # puts "%-19s %-19s %-31s %s" % [target.inspect[0..18], classification.inspect[0..18], selector.inspect[0..30], result.inspect[0..90]] if (classification == :sub_attribute)
  # # puts '' if classification == :subtree
  # end
  #
  # def classify_node selector, target
  # case
  # when target.is_a?(Hash) then :subtree
  # when selector.is_a?(Hash) && (selector.length == 1) then
  # k, v = selector.to_a[0]
  # case v
  # when Symbol then :sub_attribute
  # end
  # when selector.is_a?(Symbol) then :attribute
  # when selector.is_a?(String) && target.is_a?(Array) then :flatten_list
  # when selector.is_a?(String) && target.is_a?(Symbol) then :inner_html
  # else
  # raise "Can't classify mapping: " + [selector, target].join(" - ")
  # end
  # end
  #
  # # use #mapping to parse file
  # def parse link
  # begin hdoc = Hpricot(link.contents)
  # rescue; warn "can't hpricot #{link.to_s}" ; return false; end
  # raw_taggings = extract_tree hdoc, hdoc, self.mapping
  # end
  #
  # # use #mapping to parse file
  # def parse_file filename
  # begin hdoc = Hpricot(File.open(filename))
  # rescue; warn "can't hpricot #{filename}" ; return false; end
  # raw_taggings = extract_tree hdoc, hdoc, self.mapping
  # end
end