imw 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +15 -0
- data/CHANGELOG +0 -0
- data/LICENSE +674 -0
- data/README.rdoc +101 -0
- data/Rakefile +20 -0
- data/VERSION +1 -0
- data/etc/imwrc.rb +76 -0
- data/lib/imw.rb +42 -0
- data/lib/imw/boot.rb +58 -0
- data/lib/imw/dataset.rb +233 -0
- data/lib/imw/dataset/datamapper.rb +66 -0
- data/lib/imw/dataset/datamapper/time_and_user_stamps.rb +37 -0
- data/lib/imw/dataset/loaddump.rb +50 -0
- data/lib/imw/dataset/old/file_collection.rb +88 -0
- data/lib/imw/dataset/old/file_collection_utils.rb +71 -0
- data/lib/imw/dataset/scaffold.rb +132 -0
- data/lib/imw/dataset/scraped_uri.rb +305 -0
- data/lib/imw/dataset/scrub/old_working_scrubber.rb +87 -0
- data/lib/imw/dataset/scrub/scrub.rb +147 -0
- data/lib/imw/dataset/scrub/scrub_simple_url.rb +38 -0
- data/lib/imw/dataset/scrub/scrub_test.rb +60 -0
- data/lib/imw/dataset/scrub/slug.rb +101 -0
- data/lib/imw/dataset/stats.rb +73 -0
- data/lib/imw/dataset/stats/counter.rb +23 -0
- data/lib/imw/dataset/task.rb +38 -0
- data/lib/imw/dataset/workflow.rb +81 -0
- data/lib/imw/files.rb +110 -0
- data/lib/imw/files/archive.rb +113 -0
- data/lib/imw/files/basicfile.rb +122 -0
- data/lib/imw/files/binary.rb +28 -0
- data/lib/imw/files/compressed_file.rb +93 -0
- data/lib/imw/files/compressed_files_and_archives.rb +348 -0
- data/lib/imw/files/compressible.rb +103 -0
- data/lib/imw/files/csv.rb +112 -0
- data/lib/imw/files/json.rb +41 -0
- data/lib/imw/files/sgml.rb +65 -0
- data/lib/imw/files/text.rb +68 -0
- data/lib/imw/files/yaml.rb +46 -0
- data/lib/imw/packagers.rb +8 -0
- data/lib/imw/packagers/archiver.rb +108 -0
- data/lib/imw/packagers/s3_mover.rb +28 -0
- data/lib/imw/parsers.rb +7 -0
- data/lib/imw/parsers/html_parser.rb +382 -0
- data/lib/imw/parsers/html_parser/matchers.rb +306 -0
- data/lib/imw/parsers/line_parser.rb +87 -0
- data/lib/imw/parsers/regexp_parser.rb +72 -0
- data/lib/imw/utils.rb +24 -0
- data/lib/imw/utils/components.rb +61 -0
- data/lib/imw/utils/config.rb +46 -0
- data/lib/imw/utils/error.rb +54 -0
- data/lib/imw/utils/extensions/array.rb +125 -0
- data/lib/imw/utils/extensions/class/attribute_accessors.rb +8 -0
- data/lib/imw/utils/extensions/core.rb +43 -0
- data/lib/imw/utils/extensions/dir.rb +24 -0
- data/lib/imw/utils/extensions/file_core.rb +64 -0
- data/lib/imw/utils/extensions/hash.rb +218 -0
- data/lib/imw/utils/extensions/hpricot.rb +48 -0
- data/lib/imw/utils/extensions/string.rb +49 -0
- data/lib/imw/utils/extensions/struct.rb +42 -0
- data/lib/imw/utils/extensions/symbol.rb +28 -0
- data/lib/imw/utils/extensions/typed_struct.rb +22 -0
- data/lib/imw/utils/extensions/uri.rb +59 -0
- data/lib/imw/utils/log.rb +67 -0
- data/lib/imw/utils/misc.rb +63 -0
- data/lib/imw/utils/paths.rb +115 -0
- data/lib/imw/utils/uri.rb +59 -0
- data/lib/imw/utils/uuid.rb +33 -0
- data/lib/imw/utils/validate.rb +38 -0
- data/lib/imw/utils/version.rb +12 -0
- data/lib/imw/utils/view.rb +113 -0
- data/lib/imw/utils/view/dump_csv.rb +112 -0
- data/lib/imw/utils/view/dump_csv_older.rb +117 -0
- data/spec/data/sample.csv +131 -0
- data/spec/data/sample.tsv +131 -0
- data/spec/data/sample.txt +131 -0
- data/spec/data/sample.xml +653 -0
- data/spec/data/sample.yaml +652 -0
- data/spec/imw/dataset/datamapper/uri_spec.rb +43 -0
- data/spec/imw/dataset/datamapper_spec_helper.rb +11 -0
- data/spec/imw/files/archive_spec.rb +118 -0
- data/spec/imw/files/basicfile_spec.rb +121 -0
- data/spec/imw/files/bz2_spec.rb +32 -0
- data/spec/imw/files/compressed_file_spec.rb +96 -0
- data/spec/imw/files/compressible_spec.rb +100 -0
- data/spec/imw/files/file_spec.rb +144 -0
- data/spec/imw/files/gz_spec.rb +32 -0
- data/spec/imw/files/rar_spec.rb +33 -0
- data/spec/imw/files/tar_spec.rb +31 -0
- data/spec/imw/files/text_spec.rb +23 -0
- data/spec/imw/files/zip_spec.rb +31 -0
- data/spec/imw/files_spec.rb +38 -0
- data/spec/imw/packagers/archiver_spec.rb +125 -0
- data/spec/imw/packagers/s3_mover_spec.rb +7 -0
- data/spec/imw/parsers/line_parser_spec.rb +96 -0
- data/spec/imw/parsers/regexp_parser_spec.rb +42 -0
- data/spec/imw/utils/extensions/file_core_spec.rb +72 -0
- data/spec/imw/utils/extensions/find_spec.rb +113 -0
- data/spec/imw/utils/paths_spec.rb +38 -0
- data/spec/imw/workflow/rip/local_spec.rb +89 -0
- data/spec/imw/workflow/rip_spec.rb +27 -0
- data/spec/rcov.opts +1 -0
- data/spec/spec.opts +4 -0
- data/spec/spec_helper.rb +32 -0
- data/spec/support/archive_contents_matcher.rb +94 -0
- data/spec/support/custom_matchers.rb +21 -0
- data/spec/support/directory_contents_matcher.rb +61 -0
- data/spec/support/extensions.rb +18 -0
- data/spec/support/file_contents_matcher.rb +50 -0
- data/spec/support/random.rb +210 -0
- data/spec/support/without_regard_to_order_matcher.rb +58 -0
- metadata +196 -0
@@ -0,0 +1,108 @@
|
|
1
|
+
module IMW
|
2
|
+
module Packagers
|
3
|
+
|
4
|
+
# Packages an Array of input files into a single output archive.
|
5
|
+
# When the archive is extracted, all the input files given will be
|
6
|
+
# in a single directory with a chosen name. The path to the output
|
7
|
+
# archive determines both the name of the archive and its type (tar,
|
8
|
+
# tar.bz2, zip, &c.).
|
9
|
+
#
|
10
|
+
# If any of the input files are themselves archives, they will first
|
11
|
+
# be extracted, with only their contents winding up in the final
|
12
|
+
# directory (the file hierarchy of the archive will be preserved).
|
13
|
+
# If any of the input files are compressed, they will first be
|
14
|
+
# uncompressed before being added to the directory.
|
15
|
+
#
|
16
|
+
# Input files can be renamed by passing in a Hash instead of an
|
17
|
+
# Array. Each key in this hash is the path to an input file and its
|
18
|
+
# value is the new basename to give it. If the basename is +nil+
|
19
|
+
# then the original path's basename will be used.
|
20
|
+
class Archiver
|
21
|
+
|
22
|
+
attr_accessor :name, :inputs
|
23
|
+
|
24
|
+
def initialize name, inputs
|
25
|
+
@name = name
|
26
|
+
add_inputs inputs
|
27
|
+
end
|
28
|
+
|
29
|
+
def add_inputs new_inputs
|
30
|
+
@inputs ||= {}
|
31
|
+
if new_inputs.is_a?(Array)
|
32
|
+
new_inputs.each do |input|
|
33
|
+
@inputs[File.expand_path(input)] = File.basename(input)
|
34
|
+
end
|
35
|
+
else
|
36
|
+
new_inputs.each_pair do |input, basename|
|
37
|
+
@inputs[File.expand_path(input)] = (basename || File.basename(input))
|
38
|
+
end
|
39
|
+
end
|
40
|
+
end
|
41
|
+
|
42
|
+
def errors
|
43
|
+
@errors ||= []
|
44
|
+
end
|
45
|
+
|
46
|
+
def add_processing_error error
|
47
|
+
IMW.logger.warn error
|
48
|
+
errors << error
|
49
|
+
end
|
50
|
+
|
51
|
+
def success?
|
52
|
+
errors.empty?
|
53
|
+
end
|
54
|
+
|
55
|
+
# A temporary directory to work in. Its contents will
|
56
|
+
# ultimately consist of a directory named for the package
|
57
|
+
# containing all the input files.
|
58
|
+
def tmp_dir
|
59
|
+
@tmp_dir ||= File.join(IMW.path_to(:tmp_root, 'packager'), (Time.now.to_i.to_s + "-" + $$.to_s)) # guaranteed unique on a node
|
60
|
+
end
|
61
|
+
|
62
|
+
def clean!
|
63
|
+
FileUtils.rm_rf(tmp_dir)
|
64
|
+
end
|
65
|
+
|
66
|
+
# A directory which will contain all the content being packaged,
|
67
|
+
# including the contents of any archives that were included in
|
68
|
+
# the list of files to process.
|
69
|
+
def dir
|
70
|
+
@dir ||= File.join(tmp_dir, name.to_s)
|
71
|
+
end
|
72
|
+
|
73
|
+
def prepare!
|
74
|
+
FileUtils.mkdir_p dir unless File.exist?(dir)
|
75
|
+
inputs.each_pair do |path, basename|
|
76
|
+
new_path = File.join(dir, basename)
|
77
|
+
file = IMW.open(path, :as => IMW::Files.file_class_for(basename)) # file's original path is meaningless: RackMultipart20091203-958-1nkgc61-0
|
78
|
+
case
|
79
|
+
when file.archive?
|
80
|
+
FileUtils.cd(dir) do
|
81
|
+
file.extract
|
82
|
+
end
|
83
|
+
when file.compressed?
|
84
|
+
file.cp(new_path).decompress!
|
85
|
+
else
|
86
|
+
file.cp(new_path)
|
87
|
+
end
|
88
|
+
end
|
89
|
+
end
|
90
|
+
|
91
|
+
# Package the contents of the temporary directory to an archive
|
92
|
+
# at +output+.
|
93
|
+
def package! output, options={}
|
94
|
+
output = IMW.open(output) if output.is_a?(String)
|
95
|
+
FileUtils.mkdir_p(output.dirname) unless File.exist?(output.dirname)
|
96
|
+
output.rm! if output.exist?
|
97
|
+
FileUtils.cd(tmp_dir) do
|
98
|
+
temp_output = IMW.open(output.basename)
|
99
|
+
packaged_output = temp_output.create(name.to_s + '/*').mv(output.path)
|
100
|
+
temp_output.rm if temp_output.exist?
|
101
|
+
add_processing_error "Archiver: couldn't create archive #{output.path}" unless output.exists?
|
102
|
+
end
|
103
|
+
output
|
104
|
+
end
|
105
|
+
end
|
106
|
+
end
|
107
|
+
end
|
108
|
+
|
require 'aws/s3'
module IMW
  module Packagers

    # Uploads local files to an Amazon S3 bucket using the aws-s3 gem.
    class S3Mover

      # The raw response object of the most recent S3 request.
      attr_reader :last_response
      # The S3 bucket files are uploaded into.
      attr_accessor :bucket_name

      # Establish the S3 connection.  +options+ must contain
      # +:bucket_name+ plus whatever credentials
      # AWS::S3::Base.establish_connection! expects.
      def initialize options={}
        @bucket_name = options.delete(:bucket_name)
        AWS::S3::Base.establish_connection!(options)
      end

      # True when the most recent request completed with HTTP 200 OK.
      #
      # (A duplicate, dead definition of +success?+ that referenced a
      # nonexistent +errors+ method was removed -- it was silently
      # shadowed by this one and would have raised NoMethodError.)
      def success?
        last_response && last_response.response.class == Net::HTTPOK
      end

      # Store the file at +local_path+ under +remote_path+ in the
      # bucket, recording the response in +last_response+.
      # NOTE(review): Kernel#open here leaves the file handle to be
      # reclaimed by GC -- consider a block form; confirm before changing.
      def upload! local_path, remote_path
        @last_response = AWS::S3::S3Object.store(remote_path, open(local_path), bucket_name)
      end

    end
  end
end
data/lib/imw/parsers.rb
ADDED
@@ -0,0 +1,382 @@
|
|
1
|
+
#
|
2
|
+
# h2. lib/imw/parsers/html_parser.rb -- html parser
|
3
|
+
#
|
4
|
+
# == About
|
5
|
+
#
|
6
|
+
# h4. HTML Extractor
|
7
|
+
#
|
8
|
+
# * map repeating HTML elements to intermediate ruby data structure
|
9
|
+
# * optimize all the common cases for expressive brevity
|
10
|
+
# * output structure will come from HTML structure; map to desired output objects in transform stage.
|
11
|
+
# * spec shouldn't be allowed to get too much more complicated than this; again, transform stage exists
|
12
|
+
#
|
13
|
+
# If this doesn't yield satisfaction you may enjoy
|
14
|
+
# * http://blog.labnotes.org/2006/07/11/scraping-with-style-scrapi-toolkit-for-ruby/
|
15
|
+
# * http://scrubyt.org/
|
16
|
+
# Note of course that these have quite different goals. For example, we don't
|
17
|
+
# have any interest in "interactive" crawling, eg form submission, or at least
|
18
|
+
# that goes elsewhere.
|
19
|
+
#
|
20
|
+
#
|
21
|
+
# == Sample HTML (http://twitter.com):
|
22
|
+
#
|
23
|
+
# <ul class="about vcard entry-author">
|
24
|
+
# <li ><span class="label">Name</span> <span class="fn" >MarsPhoenix </span> </li>
|
25
|
+
# <li ><span class="label">Location</span> <span class="adr">Mars, Solar System</span> </li>
|
26
|
+
# <li id="bio"><span class="label">Bio</span> <span class="bio">I dig Mars! </span> </li>
|
27
|
+
# <li ><span class="label">Web</span>
|
28
|
+
# <a href="http://tinyurl.com/5wwaru" class="url" rel="me nofollow">http://tinyurl.co...</a></li>
|
29
|
+
# </ul>
|
30
|
+
#
|
31
|
+
# == Parser Spec:
|
32
|
+
# :hcard => m_one('//ul.vcard.about',
|
33
|
+
# {
|
34
|
+
# :name => 'li/span.fn',
|
35
|
+
# :location => 'li/span.adr',
|
36
|
+
# :url => m_attr('li/a.url[@href]', 'href'),
|
37
|
+
# :bio => 'li#bio/span.bio',
|
38
|
+
# }
|
39
|
+
# )
|
40
|
+
#
|
41
|
+
# == Example return:
|
42
|
+
# { :hcard => { :name => 'Mars Phoenix', :location => 'Mars, Solar System', :bio => 'I dig Mars!', :url => 'http://tinyurl.com/5wwaru' } }
|
43
|
+
#
|
44
|
+
# == Sample HTML (http://delicious.com):
|
45
|
+
# <ul id="bookmarklist" class="bookmarks NOTHUMB">
|
46
|
+
# <li class="post" id="item-...">
|
47
|
+
# <div class="bookmark NOTHUMB">
|
48
|
+
# <div class="dateGroup"> <span title="23 APR 08">23 APR 08</span> </div>
|
49
|
+
# <div class="data">
|
50
|
+
# <h4> <a rel="nofollow" class="taggedlink" href="http://www.cs.biu.ac.il/~koppel/BlogCorpus.htm">Blog Authorship Corpus (Blogger.com 1994)</a>
|
51
|
+
# <a class="inlinesave" href="...">SAVE</a> </h4>
|
52
|
+
# <h5 class="savers-label"> PEOPLE</h5>
|
53
|
+
# <div class="savers savers2"> <a class="delNav" href="/url/7df6661946fca61863312644eb071953"><span class="delNavCount">26</span></a> </div>
|
54
|
+
# <div class="description"> The Blog Authorship Corpus consists of the collected posts of 19,320 bloggers gathered from blogger.com in August 2004. The corpus incorporates a total of 681,288 posts and over 140 million words - or approximately 35 posts and 7250 words per person. </div>
|
55
|
+
# </div>
|
56
|
+
# <div class="meta"></div>
|
57
|
+
# <h5 class="tag-chain-label">TAGS</h5>
|
58
|
+
# <div class="tagdisplay">
|
59
|
+
# <ul class="tag-chain">
|
60
|
+
# <li class="tag-chain-item off first"><a class="tag-chain-item-link" rel="tag" href="/infochimps/blog" ><span class="tag-chain-item-span">blog</span> </a></li>
|
61
|
+
# <li class="tag-chain-item off"> <a class="tag-chain-item-link" rel="tag" href="/infochimps/corpus" ><span class="tag-chain-item-span">corpus</span> </a></li>
|
62
|
+
# <li class="tag-chain-item off"> <a class="tag-chain-item-link" rel="tag" href="/infochimps/analysis" ><span class="tag-chain-item-span">analysis</span></a></li>
|
63
|
+
# <li class="tag-chain-item off"> <a class="tag-chain-item-link" rel="tag" href="/infochimps/nlp" ><span class="tag-chain-item-span">nlp</span> </a></li>
|
64
|
+
# <li class="tag-chain-item on last"> <a class="tag-chain-item-link" rel="tag" href="/infochimps/dataset" ><span class="tag-chain-item-span">dataset</span> </a></li>
|
65
|
+
# </ul>
|
66
|
+
# </div>
|
67
|
+
# <div class="clr"></div>
|
68
|
+
# </div>
|
69
|
+
# </li>
|
70
|
+
# </ul>
|
71
|
+
#
|
72
|
+
# == Parser Specification:
|
73
|
+
# :bookmarks => [ 'ul#bookmarklist/li.post/.bookmark',
|
74
|
+
# {
|
75
|
+
# :date => hash( '.dateGroup/span',
|
76
|
+
# [:year, :month, :day] => regexp( '', /(\d{2}) ([A-Z]{3}) (\d{2})/),
|
77
|
+
# ),
|
78
|
+
# :title => '.data/h4/a.taggedlink',
|
79
|
+
# :url => attr( '.data/h4/a.taggedlink', 'href'),
|
80
|
+
#   :del_link_url => href( '.data/.savers/a.delNav'),
|
81
|
+
# :num_savers => to_i( '.data/.savers//span.delNavCount'),
|
82
|
+
# :description => '.data/.description',
|
83
|
+
# :tags => ['.tagdisplay//tag-chain-item-span']
|
84
|
+
# }
|
85
|
+
# ]
|
86
|
+
#
|
87
|
+
# == Example output:
|
88
|
+
# { :bookmarks => [
|
89
|
+
# { :date => { :year => '08', :month => 'APR', :day => '23' },
|
90
|
+
# :title => 'Blog Authorship Corpus (Blogger.com 1994)',
|
91
|
+
# :url => 'http://www.cs.biu.ac.il/~koppel/BlogCorpus.htm',
|
92
|
+
# :del_link_url => '/url/7df6661946fca61863312644eb071953',
|
93
|
+
# :num_savers => 26,
|
94
|
+
# :description => 'The Blog ... ',
|
95
|
+
# :tags => ['blog', 'corpus', 'analysis', 'nlp', 'dataset'],
|
96
|
+
# }
|
97
|
+
# ]}
|
98
|
+
#
|
99
|
+
# == Implementation:
|
100
|
+
#
|
101
|
+
# Internally, we take the spec and turn it into a recursive structure of Matcher
|
102
|
+
# objects. These consume Hpricot Elements and return the appropriately extracted
|
103
|
+
# object.
|
104
|
+
#
|
105
|
+
# Note that the /default/ is for a bare selector to match ONE element, and to not
|
106
|
+
# complain if there are many.
|
107
|
+
#
|
108
|
+
# Missing elements are silently ignored -- for example if
|
109
|
+
# :foo => 'li.missing'
|
110
|
+
# there will simply be no :foo element in the hash (as opposed to having hsh[:foo]
|
111
|
+
# set to nil -- hsh.include?(foo) will be false)
|
112
|
+
#
|
113
|
+
#
|
114
|
+
# == List of Matchers:
|
115
|
+
# { :field => /spec/, ... } # hash hash, each field taken from spec.
|
116
|
+
# [ "hpricot_path" ] # 1-el array array: for each element matching
|
117
|
+
# hpricot_path, the inner_html
|
118
|
+
# [ "hpricot_path", /spec/ ] # 2-el array array: for each element matching
|
119
|
+
# hpricot_path, pass to spec
|
120
|
+
# "hpricot_path" # string same as one("hpricot_path")
|
121
|
+
# one("hpricot_path") # one first match to hpricot_path
|
122
|
+
# one("hpricot_path", /spec/) # one applies spec to first match to hpricot_path
|
123
|
+
# (these all match on one path:)
|
124
|
+
# regexp("hpricot_path", /RE/) # regexp capture groups from matching RE against
|
125
|
+
# inner_html of first match to hpricot_path
|
126
|
+
# attr("hpricot_path", 'attr_name') # attr
|
127
|
+
# href("hpricot_path") # href shorthand for attr(foo, 'href')
|
128
|
+
# no_html # strip tags from contents
|
129
|
+
# html_encoded # html encode contents
|
130
|
+
# to_i, to_f, etc # convert
|
131
|
+
# lambda{|doc| ... } # proc calls proc on current doc
|
132
|
+
#
|
133
|
+
# == Complicated HCard example:
|
134
|
+
# :hcards => [ '//ul.users/li.vcard',
|
135
|
+
# {
|
136
|
+
# :name => '.fn',
|
137
|
+
# :address => one('.adr',
|
138
|
+
# :street => '.street',
|
139
|
+
# :city => '.city',
|
140
|
+
# :zip => '.postal'
|
141
|
+
# )
|
142
|
+
# :tel => [ 'span.tel',
|
143
|
+
# {
|
144
|
+
# :type => 'span.type',
|
145
|
+
#   [:cc, :area, :num] => hp.regexp('span.value', /\+(\d+).(\d{3})-(\d{3}-\d{4})/),
|
146
|
+
# }
|
147
|
+
# ]
|
148
|
+
# :tags => [ '.tag' ],
|
149
|
+
# }
|
150
|
+
# ]
|
151
|
+
#
|
152
|
+
# == Resulting Parser
|
153
|
+
# MatchHash({:hcards => MatchArray('//ul.users/li.hcard',
|
154
|
+
# MatchHash({
|
155
|
+
# :name => MatchFirst('.fn'),
|
156
|
+
# :address => MatchFirst('.adr',
|
157
|
+
# MatchHash({
|
158
|
+
# :street => MatchFirst('.street'),
|
159
|
+
#   :city => MatchFirst('.locality'),
|
160
|
+
#   :state => MatchFirst('.region'),
|
161
|
+
# :zip => MatchFirst('.postal'),
|
162
|
+
# }))
|
163
|
+
# :tel => MatchArray('span.tel',
|
164
|
+
# MatchHash({
|
165
|
+
# :type => MatchFirst('span.type'),
|
166
|
+
# [:cc, :area, :num] => RegexpMatcher('span.value', /+(\d+).(\d{3})-(\d{3}-\d{4})/),
|
167
|
+
# })
|
168
|
+
# )
|
169
|
+
# :tags => MatchArray('.tag'),
|
170
|
+
# })
|
171
|
+
# )
|
172
|
+
#
|
173
|
+
# == Example output
|
174
|
+
# [
|
175
|
+
# {:tel => [ {:type => 'home', :cc => '49', :area => '305', :num => '555-1212'},
|
176
|
+
# {:type => 'work', :cc => '49', :area => '305', :num => '555-6969'}, ],
|
177
|
+
# :name => "Bob Dobbs, Jr.",
|
178
|
+
# :tags => ["church"] },
|
179
|
+
# {:tel => [ {:type => 'fax', :cc => '49', :area => '305', :num => '867-5309'}, ],
|
180
|
+
# :name => "Jenny",
|
181
|
+
# :address => { :street => "53 Evergreen Terr.", :city => "Springfield" },
|
182
|
+
# :tags => ["bathroom", "wall"] },
|
183
|
+
# ]
|
184
|
+
#
|
185
|
+
# Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
|
186
|
+
# Copyright:: Copyright (c) 2008 infochimps.org
|
187
|
+
# License:: GPL 3.0
|
188
|
+
# Website:: http://infinitemonkeywrench.org/
|
189
|
+
#
|
190
|
+
# puts "#{File.basename(__FILE__)}: Something clever" # at bottom
|
191
|
+
|
192
|
+
require 'imw/parsers/html_parser/matchers'
|
193
|
+
|
194
|
+
class IMW::HTMLParser

  include IMW::HTMLParserMatcher

  # The root Matcher built from the parser spec; consumes an Hpricot
  # document and returns the extracted structure.
  attr_accessor :parse_tree

  #
  # Build the parse tree from +arg_spec+, or from the class-level
  # +parser_spec+ when no spec is given.
  #
  def initialize arg_spec=nil
    spec = arg_spec || self.class.parser_spec
    self.parse_tree = IMW::HTMLParserMatcher.build_parse_tree(spec)
  end

  #
  # See IMW::HTMLParser for syntax
  #
  # Subclasses override this to supply their default spec.
  #
  def self.parser_spec
    raise "Override this to create your own parser spec"
  end

  #
  # Walk the parse tree against +doc+ (an Hpricot document) and
  # return the extracted data.
  #
  def parse doc
    self.parse_tree.match(doc)
  end

  # one("hpricot_path")          first match to hpricot_path
  # one("hpricot_path", /spec/)  applies spec to first match to hpricot_path
  #
  # +matcher+ now defaults to nil, matching the documented one-argument
  # form above and the convention of every other factory method here.
  def self.one selector, matcher=nil
    MatchFirstElement.new(selector, IMW::HTMLParserMatcher.build_parse_tree(matcher))
  end

  # match the +attr+ attribute of the first element given by +selector+
  def self.attr selector, attr, matcher=nil
    MatchAttribute.new(selector, attr, IMW::HTMLParserMatcher.build_parse_tree(matcher))
  end

  # shorthand for +attr(foo, 'href')+
  def self.href selector, matcher=nil
    self.attr(selector, 'href', matcher)
  end

  # shorthand for +attr(foo, 'src')+
  def self.src selector, matcher=nil
    self.attr(selector, 'src', matcher)
  end

  # call +proc+ on the contents of the first element given by +selector+
  def self.proc selector, proc, matcher=nil
    MatchProc.new(selector, proc, IMW::HTMLParserMatcher.build_parse_tree(matcher))
  end

  # strip ","s (!! thus disrespecting locale !!!)
  # and convert to int
  def self.to_num selector, matcher=nil
    proc selector, lambda{|num| num.to_s.gsub(/,/,'').to_i if num }, matcher
  end

  # JSON-encode the matched contents
  def self.to_json selector, matcher=nil
    proc selector, lambda{|v| v.to_json if v }, matcher
  end

  # strip surrounding whitespace from the matched contents
  def self.strip selector, matcher=nil
    proc selector, lambda{|v| v.strip }, matcher
  end

  # match +re+ against the first element at +selector+, returning
  # all capture groups
  def self.re_group selector, re
    MatchRegexp.new(selector, re)
  end

  # like +re_group+ but returns only the first capture group
  def self.re selector, re
    MatchRegexp.new(selector, re, nil, :capture => 1)
  end

  # match +re+ repeatedly against the first element at +selector+
  # NOTE(review): +matcher+ is accepted but never forwarded -- confirm
  # against MatchRegexpRepeatedly before relying on it.
  def self.re_all selector, re, matcher=nil
    MatchRegexpRepeatedly.new(selector, re)
  end

  # (A large region of commented-out legacy mapping/extraction code --
  # extract_tree / extract_node / classify_node -- was removed here;
  # recover it from version control if ever needed.)
end
|