imw 0.2.18 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (172)
  1. data/Gemfile +7 -26
  2. data/Gemfile.lock +13 -38
  3. data/{LICENSE → LICENSE.txt} +1 -1
  4. data/README.textile +35 -0
  5. data/Rakefile +45 -22
  6. data/VERSION +1 -1
  7. data/examples/foo.rb +19 -0
  8. data/examples/html_selector.rb +22 -0
  9. data/examples/nes_game_list.csv +625 -0
  10. data/examples/nes_gamespot.csv +1371 -0
  11. data/examples/nes_nintendo.csv +624 -0
  12. data/examples/nes_unlicensed.csv +89 -0
  13. data/examples/nes_wikipedia.csv +710 -0
  14. data/examples/nibbler_test.rb +24 -0
  15. data/examples/script.rb +19 -0
  16. data/lib/imw.rb +28 -140
  17. data/lib/imw/error.rb +9 -0
  18. data/lib/imw/recordizer.rb +8 -0
  19. data/lib/imw/recordizer/html_selector_recordizer.rb +86 -0
  20. data/lib/imw/recordizer/string_slice_recordizer.rb +39 -0
  21. data/lib/imw/resource.rb +3 -119
  22. data/lib/imw/serializer.rb +7 -0
  23. data/lib/imw/serializer/json_serializer.rb +17 -0
  24. data/lib/imw/uri.rb +41 -0
  25. data/spec/resource_spec.rb +78 -0
  26. data/spec/uri_spec.rb +55 -0
  27. metadata +81 -232
  28. data/README.rdoc +0 -371
  29. data/bin/imw +0 -5
  30. data/bin/tsv_to_json.rb +0 -29
  31. data/etc/imwrc.rb +0 -26
  32. data/examples/dataset.rb +0 -12
  33. data/examples/metadata.yml +0 -10
  34. data/lib/imw/archives.rb +0 -120
  35. data/lib/imw/archives/rar.rb +0 -19
  36. data/lib/imw/archives/tar.rb +0 -19
  37. data/lib/imw/archives/tarbz2.rb +0 -73
  38. data/lib/imw/archives/targz.rb +0 -73
  39. data/lib/imw/archives/zip.rb +0 -51
  40. data/lib/imw/boot.rb +0 -87
  41. data/lib/imw/compressed_files.rb +0 -94
  42. data/lib/imw/compressed_files/bz2.rb +0 -16
  43. data/lib/imw/compressed_files/compressible.rb +0 -75
  44. data/lib/imw/compressed_files/gz.rb +0 -16
  45. data/lib/imw/dataset.rb +0 -125
  46. data/lib/imw/dataset/paths.rb +0 -29
  47. data/lib/imw/dataset/workflow.rb +0 -195
  48. data/lib/imw/formats.rb +0 -33
  49. data/lib/imw/formats/delimited.rb +0 -170
  50. data/lib/imw/formats/excel.rb +0 -100
  51. data/lib/imw/formats/json.rb +0 -41
  52. data/lib/imw/formats/pdf.rb +0 -71
  53. data/lib/imw/formats/sgml.rb +0 -69
  54. data/lib/imw/formats/yaml.rb +0 -41
  55. data/lib/imw/metadata.rb +0 -83
  56. data/lib/imw/metadata/contains_metadata.rb +0 -54
  57. data/lib/imw/metadata/dsl.rb +0 -111
  58. data/lib/imw/metadata/field.rb +0 -37
  59. data/lib/imw/metadata/has_metadata.rb +0 -98
  60. data/lib/imw/metadata/has_summary.rb +0 -57
  61. data/lib/imw/metadata/schema.rb +0 -17
  62. data/lib/imw/parsers.rb +0 -8
  63. data/lib/imw/parsers/flat.rb +0 -44
  64. data/lib/imw/parsers/html_parser.rb +0 -387
  65. data/lib/imw/parsers/html_parser/matchers.rb +0 -289
  66. data/lib/imw/parsers/line_parser.rb +0 -87
  67. data/lib/imw/parsers/regexp_parser.rb +0 -72
  68. data/lib/imw/repository.rb +0 -12
  69. data/lib/imw/runner.rb +0 -118
  70. data/lib/imw/schemes.rb +0 -23
  71. data/lib/imw/schemes/ftp.rb +0 -142
  72. data/lib/imw/schemes/hdfs.rb +0 -251
  73. data/lib/imw/schemes/http.rb +0 -165
  74. data/lib/imw/schemes/local.rb +0 -409
  75. data/lib/imw/schemes/remote.rb +0 -119
  76. data/lib/imw/schemes/s3.rb +0 -143
  77. data/lib/imw/schemes/sql.rb +0 -129
  78. data/lib/imw/tools.rb +0 -12
  79. data/lib/imw/tools/aggregator.rb +0 -148
  80. data/lib/imw/tools/archiver.rb +0 -220
  81. data/lib/imw/tools/downloader.rb +0 -63
  82. data/lib/imw/tools/extension_analyzer.rb +0 -114
  83. data/lib/imw/tools/summarizer.rb +0 -83
  84. data/lib/imw/tools/transferer.rb +0 -167
  85. data/lib/imw/utils.rb +0 -74
  86. data/lib/imw/utils/dynamically_extendable.rb +0 -137
  87. data/lib/imw/utils/error.rb +0 -59
  88. data/lib/imw/utils/extensions/hpricot.rb +0 -34
  89. data/lib/imw/utils/has_uri.rb +0 -131
  90. data/lib/imw/utils/log.rb +0 -92
  91. data/lib/imw/utils/misc.rb +0 -57
  92. data/lib/imw/utils/paths.rb +0 -146
  93. data/lib/imw/utils/uri.rb +0 -59
  94. data/lib/imw/utils/uuid.rb +0 -33
  95. data/lib/imw/utils/validate.rb +0 -38
  96. data/lib/imw/utils/version.rb +0 -11
  97. data/spec/data/formats/delimited/sample.csv +0 -131
  98. data/spec/data/formats/delimited/sample.tsv +0 -131
  99. data/spec/data/formats/delimited/with_schema/ace-hardware-locations.tsv +0 -11
  100. data/spec/data/formats/delimited/with_schema/all-countries-ip-address-to-geolocation-data.tsv +0 -16
  101. data/spec/data/formats/delimited/with_schema/complete-list-of-starbucks-locations.tsv +0 -11
  102. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-cumulative-word-count-from-from-dec.tsv +0 -22
  103. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-myspace-application-adds-by-zip-cod.tsv +0 -22
  104. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-myspace-application-counts.tsv +0 -12
  105. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-user-count-by-latlong.tsv +0 -13
  106. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-user-count-by-zip-code.tsv +0 -22
  107. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-word-count-by-day-from-december-200.tsv +0 -22
  108. data/spec/data/formats/delimited/without_schema/ace-hardware-locations.tsv +0 -10
  109. data/spec/data/formats/delimited/without_schema/all-countries-ip-address-to-geolocation-data.tsv +0 -15
  110. data/spec/data/formats/delimited/without_schema/complete-list-of-starbucks-locations.tsv +0 -10
  111. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-cumulative-word-count-from-from-dec.tsv +0 -21
  112. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-myspace-application-adds-by-zip-cod.tsv +0 -21
  113. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-myspace-application-counts.tsv +0 -11
  114. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-user-count-by-latlong.tsv +0 -12
  115. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-user-count-by-zip-code.tsv +0 -21
  116. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-word-count-by-day-from-december-200.tsv +0 -21
  117. data/spec/data/formats/excel/sample.xls +0 -0
  118. data/spec/data/formats/json/sample.json +0 -1
  119. data/spec/data/formats/none/sample +0 -650
  120. data/spec/data/formats/sgml/sample.xml +0 -617
  121. data/spec/data/formats/text/sample.txt +0 -650
  122. data/spec/data/formats/yaml/sample.yaml +0 -410
  123. data/spec/data/schema-tabular.yaml +0 -11
  124. data/spec/imw/archives/rar_spec.rb +0 -16
  125. data/spec/imw/archives/tar_spec.rb +0 -16
  126. data/spec/imw/archives/tarbz2_spec.rb +0 -24
  127. data/spec/imw/archives/targz_spec.rb +0 -21
  128. data/spec/imw/archives/zip_spec.rb +0 -16
  129. data/spec/imw/archives_spec.rb +0 -77
  130. data/spec/imw/compressed_files/bz2_spec.rb +0 -15
  131. data/spec/imw/compressed_files/compressible_spec.rb +0 -36
  132. data/spec/imw/compressed_files/gz_spec.rb +0 -15
  133. data/spec/imw/compressed_files_spec.rb +0 -47
  134. data/spec/imw/dataset/paths_spec.rb +0 -32
  135. data/spec/imw/dataset/workflow_spec.rb +0 -41
  136. data/spec/imw/formats/delimited_spec.rb +0 -44
  137. data/spec/imw/formats/excel_spec.rb +0 -55
  138. data/spec/imw/formats/json_spec.rb +0 -18
  139. data/spec/imw/formats/sgml_spec.rb +0 -24
  140. data/spec/imw/formats/yaml_spec.rb +0 -19
  141. data/spec/imw/metadata/contains_metadata_spec.rb +0 -56
  142. data/spec/imw/metadata/field_spec.rb +0 -25
  143. data/spec/imw/metadata/has_metadata_spec.rb +0 -58
  144. data/spec/imw/metadata/has_summary_spec.rb +0 -32
  145. data/spec/imw/metadata/schema_spec.rb +0 -24
  146. data/spec/imw/metadata_spec.rb +0 -86
  147. data/spec/imw/parsers/line_parser_spec.rb +0 -96
  148. data/spec/imw/parsers/regexp_parser_spec.rb +0 -42
  149. data/spec/imw/resource_spec.rb +0 -32
  150. data/spec/imw/schemes/hdfs_spec.rb +0 -67
  151. data/spec/imw/schemes/http_spec.rb +0 -19
  152. data/spec/imw/schemes/local_spec.rb +0 -165
  153. data/spec/imw/schemes/remote_spec.rb +0 -38
  154. data/spec/imw/schemes/s3_spec.rb +0 -31
  155. data/spec/imw/schemes/sql_spec.rb +0 -3
  156. data/spec/imw/tools/aggregator_spec.rb +0 -71
  157. data/spec/imw/tools/archiver_spec.rb +0 -120
  158. data/spec/imw/tools/extension_analyzer_spec.rb +0 -153
  159. data/spec/imw/tools/summarizer_spec.rb +0 -8
  160. data/spec/imw/tools/transferer_spec.rb +0 -195
  161. data/spec/imw/utils/dynamically_extendable_spec.rb +0 -69
  162. data/spec/imw/utils/has_uri_spec.rb +0 -61
  163. data/spec/imw/utils/paths_spec.rb +0 -10
  164. data/spec/imw/utils/shared_paths_spec.rb +0 -29
  165. data/spec/imw_spec.rb +0 -14
  166. data/spec/rcov.opts +0 -1
  167. data/spec/spec_helper.rb +0 -31
  168. data/spec/support/custom_matchers.rb +0 -28
  169. data/spec/support/file_contents_matcher.rb +0 -30
  170. data/spec/support/paths_matcher.rb +0 -66
  171. data/spec/support/random.rb +0 -213
  172. data/spec/support/without_regard_to_order_matcher.rb +0 -41
@@ -1,17 +0,0 @@
1
- module IMW
2
-
3
- class Metadata
4
-
5
- # Represents a schema for data.
6
- #
7
- # FIXME add methods that help couple nicely with Avro schemata.
8
- class Schema < Hash
9
-
10
- def initialize obj=nil
11
- super()
12
- merge!(obj) if obj.is_a?(Hash) || obj.is_a?(Schema)
13
- end
14
-
15
- end
16
- end
17
- end
@@ -1,8 +0,0 @@
1
- module IMW
2
- module Parsers
3
- autoload :LineParser, 'imw/parsers/line_parser'
4
- autoload :RegexpParser, 'imw/parsers/regexp_parser'
5
- autoload :HtmlParser, 'imw/parsers/html_parser'
6
- autoload :Flat, 'imw/parsers/flat'
7
- end
8
- end
@@ -1,44 +0,0 @@
1
- module IMW
2
- module Parsers
3
-
4
- class Flat
5
-
6
- attr_accessor :io
7
- attr_accessor :state
8
- attr_accessor :accumulated
9
- attr_accessor :current
10
-
11
- def initialize io
12
- self.io = io
13
- self.state = nil
14
- self.accumulated = []
15
- self.current = nil
16
- end
17
-
18
- def read_next!
19
- self.current = io.readline.chomp
20
- end
21
-
22
- def parse!
23
- while (! complete?)
24
- read_next!
25
- react_to_input!
26
- end
27
- end
28
-
29
- def accumulate!
30
- self.accumulated << current
31
- end
32
-
33
- def complete?
34
- io.eof?
35
- end
36
-
37
- def react_to_input!
38
- raise IMW::NotImplementedError.new("Override the `react_to_input!' method of the #{self.class} class")
39
- end
40
-
41
- end
42
- end
43
- end
44
-
@@ -1,387 +0,0 @@
1
- #
2
- # h2. lib/imw/parsers/html_parser.rb -- html parser
3
- #
4
- # == About
5
- #
6
- # h4. HTML Extractor
7
- #
8
- # * map repeating HTML elements to intermediate ruby data structure
9
- # * optimize all the common cases for expressive brevity
10
- # * output structure will come from HTML structure; map to desired output objects in transform stage.
11
- # * spec shouldn't be allowed to get too much more complicated than this; again, transform stage exists
12
- #
13
- # If this doesn't yield satisfaction you may enjoy
14
- # * http://blog.labnotes.org/2006/07/11/scraping-with-style-scrapi-toolkit-for-ruby/
15
- # * http://scrubyt.org/
16
- # Note of course that these have quite different goals. For example, we don't
17
- # have any interest in "interactive" crawling, eg form submission, or at least
18
- # that goes elsewhere.
19
- #
20
- #
21
- # == Sample HTML (http://twitter.com):
22
- #
23
- # <ul class="about vcard entry-author">
24
- # <li ><span class="label">Name</span> <span class="fn" >MarsPhoenix </span> </li>
25
- # <li ><span class="label">Location</span> <span class="adr">Mars, Solar System</span> </li>
26
- # <li id="bio"><span class="label">Bio</span> <span class="bio">I dig Mars! </span> </li>
27
- # <li ><span class="label">Web</span>
28
- # <a href="http://tinyurl.com/5wwaru" class="url" rel="me nofollow">http://tinyurl.co...</a></li>
29
- # </ul>
30
- #
31
- # == Parser Spec:
32
- # :hcard => m_one('//ul.vcard.about',
33
- # {
34
- # :name => 'li/span.fn',
35
- # :location => 'li/span.adr',
36
- # :url => m_attr('li/a.url[@href]', 'href'),
37
- # :bio => 'li#bio/span.bio',
38
- # }
39
- # )
40
- #
41
- # == Example return:
42
- # { :hcard => { :name => 'Mars Phoenix', :location => 'Mars, Solar System', :bio => 'I dig Mars!', :url => 'http://tinyurl.com/5wwaru' } }
43
- #
44
- # == Sample HTML (http://delicious.com):
45
- # <ul id="bookmarklist" class="bookmarks NOTHUMB">
46
- # <li class="post" id="item-...">
47
- # <div class="bookmark NOTHUMB">
48
- # <div class="dateGroup"> <span title="23 APR 08">23 APR 08</span> </div>
49
- # <div class="data">
50
- # <h4> <a rel="nofollow" class="taggedlink" href="http://www.cs.biu.ac.il/~koppel/BlogCorpus.htm">Blog Authorship Corpus (Blogger.com 1994)</a>
51
- # <a class="inlinesave" href="...">SAVE</a> </h4>
52
- # <h5 class="savers-label"> PEOPLE</h5>
53
- # <div class="savers savers2"> <a class="delNav" href="/url/7df6661946fca61863312644eb071953"><span class="delNavCount">26</span></a> </div>
54
- # <div class="description"> The Blog Authorship Corpus consists of the collected posts of 19,320 bloggers gathered from blogger.com in August 2004. The corpus incorporates a total of 681,288 posts and over 140 million words - or approximately 35 posts and 7250 words per person. </div>
55
- # </div>
56
- # <div class="meta"></div>
57
- # <h5 class="tag-chain-label">TAGS</h5>
58
- # <div class="tagdisplay">
59
- # <ul class="tag-chain">
60
- # <li class="tag-chain-item off first"><a class="tag-chain-item-link" rel="tag" href="/infochimps/blog" ><span class="tag-chain-item-span">blog</span> </a></li>
61
- # <li class="tag-chain-item off"> <a class="tag-chain-item-link" rel="tag" href="/infochimps/corpus" ><span class="tag-chain-item-span">corpus</span> </a></li>
62
- # <li class="tag-chain-item off"> <a class="tag-chain-item-link" rel="tag" href="/infochimps/analysis" ><span class="tag-chain-item-span">analysis</span></a></li>
63
- # <li class="tag-chain-item off"> <a class="tag-chain-item-link" rel="tag" href="/infochimps/nlp" ><span class="tag-chain-item-span">nlp</span> </a></li>
64
- # <li class="tag-chain-item on last"> <a class="tag-chain-item-link" rel="tag" href="/infochimps/dataset" ><span class="tag-chain-item-span">dataset</span> </a></li>
65
- # </ul>
66
- # </div>
67
- # <div class="clr"></div>
68
- # </div>
69
- # </li>
70
- # </ul>
71
- #
72
- # == Parser Specification:
73
- # :bookmarks => [ 'ul#bookmarklist/li.post/.bookmark',
74
- # {
75
- # :date => hash( '.dateGroup/span',
76
- # [:year, :month, :day] => regexp( '', /(\d{2}) ([A-Z]{3}) (\d{2})/),
77
- # ),
78
- # :title => '.data/h4/a.taggedlink',
79
- # :url => attr( '.data/h4/a.taggedlink', 'href'),
80
- # :del_link_url => href( '.data/.savers/a.delNav'),
81
- # :num_savers => to_i( '.data/.savers//span.delNavCount'),
82
- # :description => '.data/.description',
83
- # :tags => ['.tagdisplay//span.tag-chain-item-span']
84
- # }
85
- # ]
86
- #
87
- # == Example output:
88
- # { :bookmarks => [
89
- # { :date => { :year => '08', :month => 'APR', :day => '23' },
90
- # :title => 'Blog Authorship Corpus (Blogger.com 1994)',
91
- # :url => 'http://www.cs.biu.ac.il/~koppel/BlogCorpus.htm',
92
- # :del_link_url => '/url/7df6661946fca61863312644eb071953',
93
- # :num_savers => 26,
94
- # :description => 'The Blog ... ',
95
- # :tags => ['blog', 'corpus', 'analysis', 'nlp', 'dataset'],
96
- # }
97
- # ]}
98
- #
99
- # == Implementation:
100
- #
101
- # Internally, we take the spec and turn it into a recursive structure of Matcher
102
- # objects. These consume Hpricot Elements and return the appropriately extracted
103
- # object.
104
- #
105
- # Note that the /default/ is for a bare selector to match ONE element, and to not
106
- # complain if there are many.
107
- #
108
- # Missing elements are silently ignored -- for example if
109
- # :foo => 'li.missing'
110
- # there will simply be no :foo element in the hash (as opposed to having hsh[:foo]
111
- # set to nil -- hsh.include?(foo) will be false)
112
- #
113
- #
114
- # == List of Matchers:
115
- # { :field => /spec/, ... } # hash hash, each field taken from spec.
116
- # [ "hpricot_path" ] # 1-el array array: for each element matching
117
- # hpricot_path, the inner_html
118
- # [ "hpricot_path", /spec/ ] # 2-el array array: for each element matching
119
- # hpricot_path, pass to spec
120
- # "hpricot_path" # string same as one("hpricot_path")
121
- # one("hpricot_path") # one first match to hpricot_path
122
- # one("hpricot_path", /spec/) # one applies spec to first match to hpricot_path
123
- # (these all match on one path:)
124
- # regexp("hpricot_path", /RE/) # regexp capture groups from matching RE against
125
- # inner_html of first match to hpricot_path
126
- # attr("hpricot_path", 'attr_name') # attr
127
- # href("hpricot_path") # href shorthand for attr(foo, 'href')
128
- # no_html # strip tags from contents
129
- # html_encoded # html encode contents
130
- # to_i, to_f, etc # convert
131
- # lambda{|doc| ... } # proc calls proc on current doc
132
- #
133
- # == Complicated HCard example:
134
- # :hcards => [ '//ul.users/li.vcard',
135
- # {
136
- # :name => '.fn',
137
- # :address => one('.adr',
138
- # :street => '.street',
139
- # :city => '.city',
140
- # :zip => '.postal'
141
- # ),
142
- # :tel => [ 'span.tel',
143
- # {
144
- # :type => 'span.type',
145
- # [:cc, :area, :num] => hp.regexp('span.value', /\+(\d+).(\d{3})-(\d{3}-\d{4})/),
146
- # }
147
- # ],
148
- # :tags => [ '.tag' ],
149
- # }
150
- # ]
151
- #
152
- # == Resulting Parser
153
- # MatchHash({:hcards => MatchArray('//ul.users/li.vcard',
154
- # MatchHash({
155
- # :name => MatchFirst('.fn'),
156
- # :address => MatchFirst('.adr',
157
- # MatchHash({
158
- # :street => MatchFirst('.street'),
159
- # :city => MatchFirst('.locality'),
160
- # :state => MatchFirst('.region'),
161
- # :zip => MatchFirst('.postal'),
162
- # })),
163
- # :tel => MatchArray('span.tel',
164
- # MatchHash({
165
- # :type => MatchFirst('span.type'),
166
- # [:cc, :area, :num] => RegexpMatcher('span.value', /\+(\d+).(\d{3})-(\d{3}-\d{4})/),
167
- # })
168
- # ),
169
- # :tags => MatchArray('.tag'),
170
- # })
171
- # )
172
- #
173
- # == Example output
174
- # [
175
- # {:tel => [ {:type => 'home', :cc => '49', :area => '305', :num => '555-1212'},
176
- # {:type => 'work', :cc => '49', :area => '305', :num => '555-6969'}, ],
177
- # :name => "Bob Dobbs, Jr.",
178
- # :tags => ["church"] },
179
- # {:tel => [ {:type => 'fax', :cc => '49', :area => '305', :num => '867-5309'}, ],
180
- # :name => "Jenny",
181
- # :address => { :street => "53 Evergreen Terr.", :city => "Springfield" },
182
- # :tags => ["bathroom", "wall"] },
183
- # ]
184
- #
185
- # Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
186
- # Copyright:: Copyright (c) 2008 infochimps.org
187
- # License:: GPL 3.0
188
- # Website:: http://infinitemonkeywrench.org/
189
- #
190
- # puts "#{File.basename(__FILE__)}: Something clever" # at bottom
191
-
192
- require 'imw/parsers/html_parser/matchers'
193
-
194
- module IMW
195
- module Parsers
196
- class HtmlParser
197
-
198
- include IMW::Parsers::HtmlMatchers
199
-
200
- attr_accessor :parse_tree
201
-
202
- #
203
- # Parse Tree
204
- #
205
- def initialize arg_spec=nil
206
- spec = arg_spec || self.class.parser_spec
207
- self.parse_tree = IMW::Parsers::HtmlMatchers.build_parse_tree(spec)
208
- end
209
-
210
- #
211
- # See the header comment of IMW::Parsers::HtmlParser for the spec syntax
212
- #
213
- #
214
- def self.parser_spec
215
- raise "Override this to create your own parser spec"
216
- end
217
-
218
- #
219
- # Walk
220
- #
221
- def parse doc
222
- self.parse_tree.match(doc)
223
- end
224
-
225
- # one("hpricot_path") first match to hpricot_path
226
- # one("hpricot_path", /spec/) applies spec to first match to hpricot_path
227
- #
228
- def self.one selector, matcher
229
- MatchFirstElement.new(selector, IMW::Parsers::HtmlMatchers.build_parse_tree(matcher))
230
- end
231
- # match the +attr+ attribute of the first element given by +selector+
232
- def self.attr selector, attr, matcher=nil
233
- MatchAttribute.new(selector, attr, IMW::Parsers::HtmlMatchers.build_parse_tree(matcher))
234
- end
235
- # shorthand for +attr(foo, 'href')+
236
- def self.href selector, matcher=nil
237
- self.attr(selector, 'href', matcher)
238
- end
239
- # shorthand for +attr(foo, 'src')+
240
- def self.src selector, matcher=nil
241
- self.attr(selector, 'src', matcher)
242
- end
243
-
244
- def self.proc selector, proc, matcher=nil
245
- MatchProc.new(selector, proc, IMW::Parsers::HtmlMatchers.build_parse_tree(matcher))
246
- end
247
-
248
- # strip ","s (!! thus disrespecting locale !!!)
249
- # and convert to int
250
- def self.to_num selector, matcher=nil
251
- proc selector, lambda{|num| num.to_s.gsub(/,/,'').to_i if num }, matcher
252
- end
253
- def self.to_json selector, matcher=nil
254
- proc selector, lambda{|v| v.to_json if v }, matcher
255
- end
256
-
257
- def self.strip selector, matcher=nil
258
- proc selector, lambda{|v| v.strip }, matcher
259
- end
260
-
261
- def self.re_group selector, re
262
- MatchRegexp.new(selector, re)
263
- end
264
- def self.re selector, re
265
- MatchRegexp.new(selector, re, nil, :capture => 1)
266
- end
267
- def self.re_all selector, re, matcher=nil
268
- MatchRegexpRepeatedly.new(selector, re)
269
- end
270
-
271
- # def self.plain_text selector, matcher=nil
272
- # proc selector, lambda{|el| el.inner_text if el }, matcher
273
- # end
274
-
275
- # attr_accessor :mapping
276
- #
277
- # #
278
- # # Feed me a hash and I'll semantify HTML
279
- # #
280
- # # The hash should magically adhere to the too-complicated,
281
- # # ever evolving goatrope that works for the below
282
- # #
283
- # #
284
- # def initialize mapping
285
- # self.mapping = mapping
286
- # end
287
- #
288
- # #
289
- # # take a document subtree,
290
- # # and a mapping of hpricot paths to that subtree's data mapping
291
- # # recursively extract that datamapping
292
- # #
293
- # def extract_tree hdoc, content, sub_mapping
294
- # data = { }
295
- # sub_mapping.each do |selector, target|
296
- # data[selector] = []
297
- # sub_contents = content/selector
298
- # sub_contents.each do |sub_content|
299
- # sub_data = {}
300
- # extract_node hdoc, sub_content, sub_data, selector, target
301
- # data[selector] << sub_data
302
- # end
303
- # end
304
- # data
305
- # # end
306
- # # if selector.is_a?(String)
307
- # # conts = (content)
308
- # # else
309
- # # conts = [content]
310
- # # end
311
- # # conts[0..0].each do |content|
312
- # # extract_node hdoc, content, data, selector, target
313
- # # end
314
- # # end
315
- # data
316
- # end
317
- #
318
- # #
319
- # # insert the extracted element into the data mapping
320
- # #
321
- # def extract_node hdoc, content, data, selector, target
322
- # classification = classify_node(selector, target)
323
- # result = \
324
- # case classification
325
- # when :subtree
326
- # target.each do |sub_selector, sub_target|
327
- # extract_node hdoc, content, data, sub_selector, sub_target
328
- # end
329
- #
330
- # when :sub_attribute
331
- # k, v = selector.to_a[0]
332
- # subcontent = (k[0..0] == '/') ? (hdoc.at(k)) : (content.at(k))
333
- # val = subcontent.attributes[v.to_s] if subcontent
334
- # data[target] = val unless val.blank?
335
- #
336
- # when :attribute then
337
- # val = content.attributes[selector.to_s]
338
- # data[target] = val unless val.blank?
339
- #
340
- # when :flatten_list
341
- # subcontents = (selector[0..0] == '/') ? (hdoc/selector) : (content/selector)
342
- # data[target.first] = subcontents.map{|subcontent| subcontent.inner_html }
343
- #
344
- # when :inner_html
345
- # subcontent = (selector[0..0] == '/') ? (hdoc.at(selector)) : (content.at(selector))
346
- # data[target] = subcontent.inner_html.strip if subcontent
347
- #
348
- # else
349
- # raise "classify_node shouldn't ever return #{classification}"
350
- # end
351
- # # puts "%-19s %-19s %-31s %s" % [target.inspect[0..18], classification.inspect[0..18], selector.inspect[0..30], result.inspect[0..90]] if (classification == :sub_attribute)
352
- # # puts '' if classification == :subtree
353
- # end
354
- #
355
- # def classify_node selector, target
356
- # case
357
- # when target.is_a?(Hash) then :subtree
358
- # when selector.is_a?(Hash) && (selector.length == 1) then
359
- # k, v = selector.to_a[0]
360
- # case v
361
- # when Symbol then :sub_attribute
362
- # end
363
- # when selector.is_a?(Symbol) then :attribute
364
- # when selector.is_a?(String) && target.is_a?(Array) then :flatten_list
365
- # when selector.is_a?(String) && target.is_a?(Symbol) then :inner_html
366
- # else
367
- # raise "Can't classify mapping: " + [selector, target].join(" - ")
368
- # end
369
- # end
370
- #
371
- # # use #mapping to parse file
372
- # def parse link
373
- # begin hdoc = Hpricot(link.contents)
374
- # rescue; warn "can't hpricot #{link.to_s}" ; return false; end
375
- # raw_taggings = extract_tree hdoc, hdoc, self.mapping
376
- # end
377
- #
378
- # # use #mapping to parse file
379
- # def parse_file filename
380
- # begin hdoc = Hpricot(File.open(filename))
381
- # rescue; warn "can't hpricot #{filename}" ; return false; end
382
- # raw_taggings = extract_tree hdoc, hdoc, self.mapping
383
- # end
384
- end
385
- end
386
- end
387
-