imw 0.2.18 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (172) hide show
  1. data/Gemfile +7 -26
  2. data/Gemfile.lock +13 -38
  3. data/{LICENSE → LICENSE.txt} +1 -1
  4. data/README.textile +35 -0
  5. data/Rakefile +45 -22
  6. data/VERSION +1 -1
  7. data/examples/foo.rb +19 -0
  8. data/examples/html_selector.rb +22 -0
  9. data/examples/nes_game_list.csv +625 -0
  10. data/examples/nes_gamespot.csv +1371 -0
  11. data/examples/nes_nintendo.csv +624 -0
  12. data/examples/nes_unlicensed.csv +89 -0
  13. data/examples/nes_wikipedia.csv +710 -0
  14. data/examples/nibbler_test.rb +24 -0
  15. data/examples/script.rb +19 -0
  16. data/lib/imw.rb +28 -140
  17. data/lib/imw/error.rb +9 -0
  18. data/lib/imw/recordizer.rb +8 -0
  19. data/lib/imw/recordizer/html_selector_recordizer.rb +86 -0
  20. data/lib/imw/recordizer/string_slice_recordizer.rb +39 -0
  21. data/lib/imw/resource.rb +3 -119
  22. data/lib/imw/serializer.rb +7 -0
  23. data/lib/imw/serializer/json_serializer.rb +17 -0
  24. data/lib/imw/uri.rb +41 -0
  25. data/spec/resource_spec.rb +78 -0
  26. data/spec/uri_spec.rb +55 -0
  27. metadata +81 -232
  28. data/README.rdoc +0 -371
  29. data/bin/imw +0 -5
  30. data/bin/tsv_to_json.rb +0 -29
  31. data/etc/imwrc.rb +0 -26
  32. data/examples/dataset.rb +0 -12
  33. data/examples/metadata.yml +0 -10
  34. data/lib/imw/archives.rb +0 -120
  35. data/lib/imw/archives/rar.rb +0 -19
  36. data/lib/imw/archives/tar.rb +0 -19
  37. data/lib/imw/archives/tarbz2.rb +0 -73
  38. data/lib/imw/archives/targz.rb +0 -73
  39. data/lib/imw/archives/zip.rb +0 -51
  40. data/lib/imw/boot.rb +0 -87
  41. data/lib/imw/compressed_files.rb +0 -94
  42. data/lib/imw/compressed_files/bz2.rb +0 -16
  43. data/lib/imw/compressed_files/compressible.rb +0 -75
  44. data/lib/imw/compressed_files/gz.rb +0 -16
  45. data/lib/imw/dataset.rb +0 -125
  46. data/lib/imw/dataset/paths.rb +0 -29
  47. data/lib/imw/dataset/workflow.rb +0 -195
  48. data/lib/imw/formats.rb +0 -33
  49. data/lib/imw/formats/delimited.rb +0 -170
  50. data/lib/imw/formats/excel.rb +0 -100
  51. data/lib/imw/formats/json.rb +0 -41
  52. data/lib/imw/formats/pdf.rb +0 -71
  53. data/lib/imw/formats/sgml.rb +0 -69
  54. data/lib/imw/formats/yaml.rb +0 -41
  55. data/lib/imw/metadata.rb +0 -83
  56. data/lib/imw/metadata/contains_metadata.rb +0 -54
  57. data/lib/imw/metadata/dsl.rb +0 -111
  58. data/lib/imw/metadata/field.rb +0 -37
  59. data/lib/imw/metadata/has_metadata.rb +0 -98
  60. data/lib/imw/metadata/has_summary.rb +0 -57
  61. data/lib/imw/metadata/schema.rb +0 -17
  62. data/lib/imw/parsers.rb +0 -8
  63. data/lib/imw/parsers/flat.rb +0 -44
  64. data/lib/imw/parsers/html_parser.rb +0 -387
  65. data/lib/imw/parsers/html_parser/matchers.rb +0 -289
  66. data/lib/imw/parsers/line_parser.rb +0 -87
  67. data/lib/imw/parsers/regexp_parser.rb +0 -72
  68. data/lib/imw/repository.rb +0 -12
  69. data/lib/imw/runner.rb +0 -118
  70. data/lib/imw/schemes.rb +0 -23
  71. data/lib/imw/schemes/ftp.rb +0 -142
  72. data/lib/imw/schemes/hdfs.rb +0 -251
  73. data/lib/imw/schemes/http.rb +0 -165
  74. data/lib/imw/schemes/local.rb +0 -409
  75. data/lib/imw/schemes/remote.rb +0 -119
  76. data/lib/imw/schemes/s3.rb +0 -143
  77. data/lib/imw/schemes/sql.rb +0 -129
  78. data/lib/imw/tools.rb +0 -12
  79. data/lib/imw/tools/aggregator.rb +0 -148
  80. data/lib/imw/tools/archiver.rb +0 -220
  81. data/lib/imw/tools/downloader.rb +0 -63
  82. data/lib/imw/tools/extension_analyzer.rb +0 -114
  83. data/lib/imw/tools/summarizer.rb +0 -83
  84. data/lib/imw/tools/transferer.rb +0 -167
  85. data/lib/imw/utils.rb +0 -74
  86. data/lib/imw/utils/dynamically_extendable.rb +0 -137
  87. data/lib/imw/utils/error.rb +0 -59
  88. data/lib/imw/utils/extensions/hpricot.rb +0 -34
  89. data/lib/imw/utils/has_uri.rb +0 -131
  90. data/lib/imw/utils/log.rb +0 -92
  91. data/lib/imw/utils/misc.rb +0 -57
  92. data/lib/imw/utils/paths.rb +0 -146
  93. data/lib/imw/utils/uri.rb +0 -59
  94. data/lib/imw/utils/uuid.rb +0 -33
  95. data/lib/imw/utils/validate.rb +0 -38
  96. data/lib/imw/utils/version.rb +0 -11
  97. data/spec/data/formats/delimited/sample.csv +0 -131
  98. data/spec/data/formats/delimited/sample.tsv +0 -131
  99. data/spec/data/formats/delimited/with_schema/ace-hardware-locations.tsv +0 -11
  100. data/spec/data/formats/delimited/with_schema/all-countries-ip-address-to-geolocation-data.tsv +0 -16
  101. data/spec/data/formats/delimited/with_schema/complete-list-of-starbucks-locations.tsv +0 -11
  102. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-cumulative-word-count-from-from-dec.tsv +0 -22
  103. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-myspace-application-adds-by-zip-cod.tsv +0 -22
  104. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-myspace-application-counts.tsv +0 -12
  105. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-user-count-by-latlong.tsv +0 -13
  106. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-user-count-by-zip-code.tsv +0 -22
  107. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-word-count-by-day-from-december-200.tsv +0 -22
  108. data/spec/data/formats/delimited/without_schema/ace-hardware-locations.tsv +0 -10
  109. data/spec/data/formats/delimited/without_schema/all-countries-ip-address-to-geolocation-data.tsv +0 -15
  110. data/spec/data/formats/delimited/without_schema/complete-list-of-starbucks-locations.tsv +0 -10
  111. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-cumulative-word-count-from-from-dec.tsv +0 -21
  112. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-myspace-application-adds-by-zip-cod.tsv +0 -21
  113. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-myspace-application-counts.tsv +0 -11
  114. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-user-count-by-latlong.tsv +0 -12
  115. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-user-count-by-zip-code.tsv +0 -21
  116. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-word-count-by-day-from-december-200.tsv +0 -21
  117. data/spec/data/formats/excel/sample.xls +0 -0
  118. data/spec/data/formats/json/sample.json +0 -1
  119. data/spec/data/formats/none/sample +0 -650
  120. data/spec/data/formats/sgml/sample.xml +0 -617
  121. data/spec/data/formats/text/sample.txt +0 -650
  122. data/spec/data/formats/yaml/sample.yaml +0 -410
  123. data/spec/data/schema-tabular.yaml +0 -11
  124. data/spec/imw/archives/rar_spec.rb +0 -16
  125. data/spec/imw/archives/tar_spec.rb +0 -16
  126. data/spec/imw/archives/tarbz2_spec.rb +0 -24
  127. data/spec/imw/archives/targz_spec.rb +0 -21
  128. data/spec/imw/archives/zip_spec.rb +0 -16
  129. data/spec/imw/archives_spec.rb +0 -77
  130. data/spec/imw/compressed_files/bz2_spec.rb +0 -15
  131. data/spec/imw/compressed_files/compressible_spec.rb +0 -36
  132. data/spec/imw/compressed_files/gz_spec.rb +0 -15
  133. data/spec/imw/compressed_files_spec.rb +0 -47
  134. data/spec/imw/dataset/paths_spec.rb +0 -32
  135. data/spec/imw/dataset/workflow_spec.rb +0 -41
  136. data/spec/imw/formats/delimited_spec.rb +0 -44
  137. data/spec/imw/formats/excel_spec.rb +0 -55
  138. data/spec/imw/formats/json_spec.rb +0 -18
  139. data/spec/imw/formats/sgml_spec.rb +0 -24
  140. data/spec/imw/formats/yaml_spec.rb +0 -19
  141. data/spec/imw/metadata/contains_metadata_spec.rb +0 -56
  142. data/spec/imw/metadata/field_spec.rb +0 -25
  143. data/spec/imw/metadata/has_metadata_spec.rb +0 -58
  144. data/spec/imw/metadata/has_summary_spec.rb +0 -32
  145. data/spec/imw/metadata/schema_spec.rb +0 -24
  146. data/spec/imw/metadata_spec.rb +0 -86
  147. data/spec/imw/parsers/line_parser_spec.rb +0 -96
  148. data/spec/imw/parsers/regexp_parser_spec.rb +0 -42
  149. data/spec/imw/resource_spec.rb +0 -32
  150. data/spec/imw/schemes/hdfs_spec.rb +0 -67
  151. data/spec/imw/schemes/http_spec.rb +0 -19
  152. data/spec/imw/schemes/local_spec.rb +0 -165
  153. data/spec/imw/schemes/remote_spec.rb +0 -38
  154. data/spec/imw/schemes/s3_spec.rb +0 -31
  155. data/spec/imw/schemes/sql_spec.rb +0 -3
  156. data/spec/imw/tools/aggregator_spec.rb +0 -71
  157. data/spec/imw/tools/archiver_spec.rb +0 -120
  158. data/spec/imw/tools/extension_analyzer_spec.rb +0 -153
  159. data/spec/imw/tools/summarizer_spec.rb +0 -8
  160. data/spec/imw/tools/transferer_spec.rb +0 -195
  161. data/spec/imw/utils/dynamically_extendable_spec.rb +0 -69
  162. data/spec/imw/utils/has_uri_spec.rb +0 -61
  163. data/spec/imw/utils/paths_spec.rb +0 -10
  164. data/spec/imw/utils/shared_paths_spec.rb +0 -29
  165. data/spec/imw_spec.rb +0 -14
  166. data/spec/rcov.opts +0 -1
  167. data/spec/spec_helper.rb +0 -31
  168. data/spec/support/custom_matchers.rb +0 -28
  169. data/spec/support/file_contents_matcher.rb +0 -30
  170. data/spec/support/paths_matcher.rb +0 -66
  171. data/spec/support/random.rb +0 -213
  172. data/spec/support/without_regard_to_order_matcher.rb +0 -41
@@ -1,17 +0,0 @@
1
- module IMW
2
-
3
- class Metadata
4
-
5
- # Represents a schema for data.
6
- #
7
- # FIXME add methods that help couple nicely with Avro schemata.
8
- class Schema < Hash
9
-
10
- def initialize obj=nil
11
- super()
12
- merge!(obj) if obj.is_a?(Hash) || obj.is_a?(Schema)
13
- end
14
-
15
- end
16
- end
17
- end
@@ -1,8 +0,0 @@
1
- module IMW
2
- module Parsers
3
- autoload :LineParser, 'imw/parsers/line_parser'
4
- autoload :RegexpParser, 'imw/parsers/regexp_parser'
5
- autoload :HtmlParser, 'imw/parsers/html_parser'
6
- autoload :Flat, 'imw/parsers/flat'
7
- end
8
- end
@@ -1,44 +0,0 @@
1
- module IMW
2
- module Parsers
3
-
4
- class Flat
5
-
6
- attr_accessor :io
7
- attr_accessor :state
8
- attr_accessor :accumulated
9
- attr_accessor :current
10
-
11
- def initialize io
12
- self.io = io
13
- self.state = nil
14
- self.accumulated = []
15
- self.current = nil
16
- end
17
-
18
- def read_next!
19
- self.current = io.readline.chomp
20
- end
21
-
22
- def parse!
23
- while (! complete?)
24
- read_next!
25
- react_to_input!
26
- end
27
- end
28
-
29
- def accumulate!
30
- self.accumulated << current
31
- end
32
-
33
- def complete?
34
- io.eof?
35
- end
36
-
37
- def react_to_input!
38
- raise IMW::NotImplementedError.new("Override the `react_to_input!' method of the #{self.class} class")
39
- end
40
-
41
- end
42
- end
43
- end
44
-
@@ -1,387 +0,0 @@
1
- #
2
- # h2. lib/imw/parsers/html_parser.rb -- html parser
3
- #
4
- # == About
5
- #
6
- # h4. HTML Extractor
7
- #
8
- # * map repeating HTML elements to intermediate ruby data structure
9
- # * optimize all the common cases for expressive brevity
10
- # * output structure will come from HTML structure; map to desired output objects in transform stage.
11
- # * spec shouldn't be allowed to get too much more complicated than this; again, transform stage exists
12
- #
13
- # If this doesn't yield satisfaction you may enjoy
14
- # * http://blog.labnotes.org/2006/07/11/scraping-with-style-scrapi-toolkit-for-ruby/
15
- # * http://scrubyt.org/
16
- # Note of course that these have quite different goals. For example, we don't
17
- # have any interest in "interactive" crawling, eg form submission, or at least
18
- # that goes elsewhere.
19
- #
20
- #
21
- # == Sample HTML (http://twitter.com):
22
- #
23
- # <ul class="about vcard entry-author">
24
- # <li ><span class="label">Name</span> <span class="fn" >MarsPhoenix </span> </li>
25
- # <li ><span class="label">Location</span> <span class="adr">Mars, Solar System</span> </li>
26
- # <li id="bio"><span class="label">Bio</span> <span class="bio">I dig Mars! </span> </li>
27
- # <li ><span class="label">Web</span>
28
- # <a href="http://tinyurl.com/5wwaru" class="url" rel="me nofollow">http://tinyurl.co...</a></li>
29
- # </ul>
30
- #
31
- # == Parser Spec:
32
- # :hcard => m_one('//ul.vcard.about',
33
- # {
34
- # :name => 'li/span.fn',
35
- # :location => 'li/span.adr',
36
- # :url => m_attr('li/a.url[@href]', 'href'),
37
- # :bio => 'li#bio/span.bio',
38
- # }
39
- # )
40
- #
41
- # == Example return:
42
- # { :hcard => { :name => 'Mars Phoenix', :location => 'Mars, Solar System', :bio => 'I dig Mars!', :url => 'http://tinyurl.com/5wwaru' } }
43
- #
44
- # == Sample HTML (http://delicious.com):
45
- # <ul id="bookmarklist" class="bookmarks NOTHUMB">
46
- # <li class="post" id="item-...">
47
- # <div class="bookmark NOTHUMB">
48
- # <div class="dateGroup"> <span title="23 APR 08">23 APR 08</span> </div>
49
- # <div class="data">
50
- # <h4> <a rel="nofollow" class="taggedlink" href="http://www.cs.biu.ac.il/~koppel/BlogCorpus.htm">Blog Authorship Corpus (Blogger.com 1994)</a>
51
- # <a class="inlinesave" href="...">SAVE</a> </h4>
52
- # <h5 class="savers-label"> PEOPLE</h5>
53
- # <div class="savers savers2"> <a class="delNav" href="/url/7df6661946fca61863312644eb071953"><span class="delNavCount">26</span></a> </div>
54
- # <div class="description"> The Blog Authorship Corpus consists of the collected posts of 19,320 bloggers gathered from blogger.com in August 2004. The corpus incorporates a total of 681,288 posts and over 140 million words - or approximately 35 posts and 7250 words per person. </div>
55
- # </div>
56
- # <div class="meta"></div>
57
- # <h5 class="tag-chain-label">TAGS</h5>
58
- # <div class="tagdisplay">
59
- # <ul class="tag-chain">
60
- # <li class="tag-chain-item off first"><a class="tag-chain-item-link" rel="tag" href="/infochimps/blog" ><span class="tag-chain-item-span">blog</span> </a></li>
61
- # <li class="tag-chain-item off"> <a class="tag-chain-item-link" rel="tag" href="/infochimps/corpus" ><span class="tag-chain-item-span">corpus</span> </a></li>
62
- # <li class="tag-chain-item off"> <a class="tag-chain-item-link" rel="tag" href="/infochimps/analysis" ><span class="tag-chain-item-span">analysis</span></a></li>
63
- # <li class="tag-chain-item off"> <a class="tag-chain-item-link" rel="tag" href="/infochimps/nlp" ><span class="tag-chain-item-span">nlp</span> </a></li>
64
- # <li class="tag-chain-item on last"> <a class="tag-chain-item-link" rel="tag" href="/infochimps/dataset" ><span class="tag-chain-item-span">dataset</span> </a></li>
65
- # </ul>
66
- # </div>
67
- # <div class="clr"></div>
68
- # </div>
69
- # </li>
70
- # </ul>
71
- #
72
- # == Parser Specification:
73
- # :bookmarks => [ 'ul#bookmarklist/li.post/.bookmark',
74
- # {
75
- # :date => hash( '.dateGroup/span',
76
- # [:year, :month, :day] => regexp( '', /(\d{2}) ([A-Z]{3}) (\d{2})/),
77
- # ),
78
- # :title => '.data/h4/a.taggedlink',
79
- # :url => attr( '.data/h4/a.taggedlink', 'href'),
80
- # :del_link_url => href( '.data/.savers/a.delNav'),
81
- # :num_savers => to_i( '.data/.savers//span.delNavCount'),
82
- # :description => '.data/.description',
83
- # :tags => ['.tagdisplay//span.tag-chain-item-span']
84
- # }
85
- # ]
86
- #
87
- # == Example output:
88
- # { :bookmarks => [
89
- # { :date => { :year => '08', :month => 'APR', :day => '23' },
90
- # :title => 'Blog Authorship Corpus (Blogger.com 1994)',
91
- # :url => 'http://www.cs.biu.ac.il/~koppel/BlogCorpus.htm',
92
- # :del_link_url => '/url/7df6661946fca61863312644eb071953',
93
- # :num_savers => 26,
94
- # :description => 'The Blog ... ',
95
- # :tags => ['blog', 'corpus', 'analysis', 'nlp', 'dataset'],
96
- # }
97
- # ]}
98
- #
99
- # == Implementation:
100
- #
101
- # Internally, we take the spec and turn it into a recursive structure of Matcher
102
- # objects. These consume Hpricot Elements and return the appropriately extracted
103
- # object.
104
- #
105
- # Note that the /default/ is for a bare selector to match ONE element, and to not
106
- # complain if there are many.
107
- #
108
- # Missing elements are silently ignored -- for example if
109
- # :foo => 'li.missing'
110
- # there will simply be no :foo element in the hash (as opposed to having hsh[:foo]
111
- # set to nil -- hsh.include?(foo) will be false)
112
- #
113
- #
114
- # == List of Matchers:
115
- # { :field => /spec/, ... } # hash hash, each field taken from spec.
116
- # [ "hpricot_path" ] # 1-el array array: for each element matching
117
- # hpricot_path, the inner_html
118
- # [ "hpricot_path", /spec/ ] # 2-el array array: for each element matching
119
- # hpricot_path, pass to spec
120
- # "hpricot_path" # string same as one("hpricot_path")
121
- # one("hpricot_path") # one first match to hpricot_path
122
- # one("hpricot_path", /spec/) # one applies spec to first match to hpricot_path
123
- # (these all match on one path:)
124
- # regexp("hpricot_path", /RE/) # regexp capture groups from matching RE against
125
- # inner_html of first match to hpricot_path
126
- # attr("hpricot_path", 'attr_name') # attr
127
- # href("hpricot_path") # href shorthand for attr(foo, 'href')
128
- # no_html # strip tags from contents
129
- # html_encoded # html encode contents
130
- # to_i, to_f, etc # convert
131
- # lambda{|doc| ... } # proc calls proc on current doc
132
- #
133
- # == Complicated HCard example:
134
- # :hcards => [ '//ul.users/li.vcard',
135
- # {
136
- # :name => '.fn',
137
- # :address => one('.adr',
138
- # :street => '.street',
139
- # :city => '.city',
140
- # :zip => '.postal'
141
- # ),
142
- # :tel => [ 'span.tel',
143
- # {
144
- # :type => 'span.type',
145
- # [:cc, :area, :num] => hp.regexp('span.value', /\+(\d+).(\d{3})-(\d{3}-\d{4})/),
146
- # }
147
- # ],
148
- # :tags => [ '.tag' ],
149
- # }
150
- # ]
151
- #
152
- # == Resulting Parser
153
- # MatchHash({:hcards => MatchArray('//ul.users/li.vcard',
154
- # MatchHash({
155
- # :name => MatchFirst('.fn'),
156
- # :address => MatchFirst('.adr',
157
- # MatchHash({
158
- # :street => MatchFirst('.street'),
159
- # :city => MatchFirst('.locality'),
160
- # :state => MatchFirst('.region'),
161
- # :zip => MatchFirst('.postal'),
162
- # }))
163
- # :tel => MatchArray('span.tel',
164
- # MatchHash({
165
- # :type => MatchFirst('span.type'),
166
- # [:cc, :area, :num] => RegexpMatcher('span.value', /\+(\d+).(\d{3})-(\d{3}-\d{4})/),
167
- # })
168
- # )
169
- # :tags => MatchArray('.tag'),
170
- # })
171
- # )
172
- #
173
- # == Example output
174
- # [
175
- # {:tel => [ {:type => 'home', :cc => '49', :area => '305', :num => '555-1212'},
176
- # {:type => 'work', :cc => '49', :area => '305', :num => '555-6969'}, ],
177
- # :name => "Bob Dobbs, Jr.",
178
- # :tags => ["church"] },
179
- # {:tel => [ {:type => 'fax', :cc => '49', :area => '305', :num => '867-5309'}, ],
180
- # :name => "Jenny",
181
- # :address => { :street => "53 Evergreen Terr.", :city => "Springfield" },
182
- # :tags => ["bathroom", "wall"] },
183
- # ]
184
- #
185
- # Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
186
- # Copyright:: Copyright (c) 2008 infochimps.org
187
- # License:: GPL 3.0
188
- # Website:: http://infinitemonkeywrench.org/
189
- #
190
- # puts "#{File.basename(__FILE__)}: Something clever" # at bottom
191
-
192
- require 'imw/parsers/html_parser/matchers'
193
-
194
- module IMW
195
- module Parsers
196
- class HtmlParser
197
-
198
- include IMW::Parsers::HtmlMatchers
199
-
200
- attr_accessor :parse_tree
201
-
202
- #
203
- # Parse Tree
204
- #
205
- def initialize arg_spec=nil
206
- spec = arg_spec || self.class.parser_spec
207
- self.parse_tree = IMW::Parsers::HtmlMatchers.build_parse_tree(spec)
208
- end
209
-
210
- #
211
- # See the header comment of IMW::Parsers::HtmlParser for the spec syntax
212
- #
213
- #
214
- def self.parser_spec
215
- raise "Override this to create your own parser spec"
216
- end
217
-
218
- #
219
- # Walk
220
- #
221
- def parse doc
222
- self.parse_tree.match(doc)
223
- end
224
-
225
- # one("hpricot_path") first match to hpricot_path
226
- # one("hpricot_path", /spec/) applies spec to first match to hpricot_path
227
- #
228
- def self.one selector, matcher
229
- MatchFirstElement.new(selector, IMW::Parsers::HtmlMatchers.build_parse_tree(matcher))
230
- end
231
- # match the +attr+ attribute of the first element given by +selector+
232
- def self.attr selector, attr, matcher=nil
233
- MatchAttribute.new(selector, attr, IMW::Parsers::HtmlMatchers.build_parse_tree(matcher))
234
- end
235
- # shorthand for +attr(foo, 'href')+
236
- def self.href selector, matcher=nil
237
- self.attr(selector, 'href', matcher)
238
- end
239
- # shorthand for +attr(foo, 'src')+
240
- def self.src selector, matcher=nil
241
- self.attr(selector, 'src', matcher)
242
- end
243
-
244
- def self.proc selector, proc, matcher=nil
245
- MatchProc.new(selector, proc, IMW::Parsers::HtmlMatchers.build_parse_tree(matcher))
246
- end
247
-
248
- # strip ","s (!! thus disrespecting locale !!!)
249
- # and convert to int
250
- def self.to_num selector, matcher=nil
251
- proc selector, lambda{|num| num.to_s.gsub(/,/,'').to_i if num }, matcher
252
- end
253
- def self.to_json selector, matcher=nil
254
- proc selector, lambda{|v| v.to_json if v }, matcher
255
- end
256
-
257
- def self.strip selector, matcher=nil
258
- proc selector, lambda{|v| v.strip }, matcher
259
- end
260
-
261
- def self.re_group selector, re
262
- MatchRegexp.new(selector, re)
263
- end
264
- def self.re selector, re
265
- MatchRegexp.new(selector, re, nil, :capture => 1)
266
- end
267
- def self.re_all selector, re, matcher=nil
268
- MatchRegexpRepeatedly.new(selector, re)
269
- end
270
-
271
- # def self.plain_text selector, matcher=nil
272
- # proc selector, lambda{|el| el.inner_text if el }, matcher
273
- # end
274
-
275
- # attr_accessor :mapping
276
- #
277
- # #
278
- # # Feed me a hash and I'll semantify HTML
279
- # #
280
- # # The hash should magically adhere to the too-complicated,
281
- # # ever evolving goatrope that works for the below
282
- # #
283
- # #
284
- # def initialize mapping
285
- # self.mapping = mapping
286
- # end
287
- #
288
- # #
289
- # # take a document subtree,
290
- # # and a mapping of hpricot paths to that subtree's data mapping
291
- # # recursively extract that datamapping
292
- # #
293
- # def extract_tree hdoc, content, sub_mapping
294
- # data = { }
295
- # sub_mapping.each do |selector, target|
296
- # data[selector] = []
297
- # sub_contents = content/selector
298
- # sub_contents.each do |sub_content|
299
- # sub_data = {}
300
- # extract_node hdoc, sub_content, sub_data, selector, target
301
- # data[selector] << sub_data
302
- # end
303
- # end
304
- # data
305
- # # end
306
- # # if selector.is_a?(String)
307
- # # conts = (content)
308
- # # else
309
- # # conts = [content]
310
- # # end
311
- # # conts[0..0].each do |content|
312
- # # extract_node hdoc, content, data, selector, target
313
- # # end
314
- # # end
315
- # data
316
- # end
317
- #
318
- # #
319
- # # insert the extracted element into the data mapping
320
- # #
321
- # def extract_node hdoc, content, data, selector, target
322
- # classification = classify_node(selector, target)
323
- # result = \
324
- # case classification
325
- # when :subtree
326
- # target.each do |sub_selector, sub_target|
327
- # extract_node hdoc, content, data, sub_selector, sub_target
328
- # end
329
- #
330
- # when :sub_attribute
331
- # k, v = selector.to_a[0]
332
- # subcontent = (k[0..0] == '/') ? (hdoc.at(k)) : (content.at(k))
333
- # val = subcontent.attributes[v.to_s] if subcontent
334
- # data[target] = val unless val.blank?
335
- #
336
- # when :attribute then
337
- # val = content.attributes[selector.to_s]
338
- # data[target] = val unless val.blank?
339
- #
340
- # when :flatten_list
341
- # subcontents = (selector[0..0] == '/') ? (hdoc/selector) : (content/selector)
342
- # data[target.first] = subcontents.map{|subcontent| subcontent.inner_html }
343
- #
344
- # when :inner_html
345
- # subcontent = (selector[0..0] == '/') ? (hdoc.at(selector)) : (content.at(selector))
346
- # data[target] = subcontent.inner_html.strip if subcontent
347
- #
348
- # else
349
- # raise "classify_node shouldn't ever return #{classification}"
350
- # end
351
- # # puts "%-19s %-19s %-31s %s" % [target.inspect[0..18], classification.inspect[0..18], selector.inspect[0..30], result.inspect[0..90]] if (classification == :sub_attribute)
352
- # # puts '' if classification == :subtree
353
- # end
354
- #
355
- # def classify_node selector, target
356
- # case
357
- # when target.is_a?(Hash) then :subtree
358
- # when selector.is_a?(Hash) && (selector.length == 1) then
359
- # k, v = selector.to_a[0]
360
- # case v
361
- # when Symbol then :sub_attribute
362
- # end
363
- # when selector.is_a?(Symbol) then :attribute
364
- # when selector.is_a?(String) && target.is_a?(Array) then :flatten_list
365
- # when selector.is_a?(String) && target.is_a?(Symbol) then :inner_html
366
- # else
367
- # raise "Can't classify mapping: " + [selector, target].join(" - ")
368
- # end
369
- # end
370
- #
371
- # # use #mapping to parse file
372
- # def parse link
373
- # begin hdoc = Hpricot(link.contents)
374
- # rescue; warn "can't hpricot #{link.to_s}" ; return false; end
375
- # raw_taggings = extract_tree hdoc, hdoc, self.mapping
376
- # end
377
- #
378
- # # use #mapping to parse file
379
- # def parse_file filename
380
- # begin hdoc = Hpricot(File.open(filename))
381
- # rescue; warn "can't hpricot #{filename}" ; return false; end
382
- # raw_taggings = extract_tree hdoc, hdoc, self.mapping
383
- # end
384
- end
385
- end
386
- end
387
-