imw 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +15 -0
- data/CHANGELOG +0 -0
- data/LICENSE +674 -0
- data/README.rdoc +101 -0
- data/Rakefile +20 -0
- data/VERSION +1 -0
- data/etc/imwrc.rb +76 -0
- data/lib/imw.rb +42 -0
- data/lib/imw/boot.rb +58 -0
- data/lib/imw/dataset.rb +233 -0
- data/lib/imw/dataset/datamapper.rb +66 -0
- data/lib/imw/dataset/datamapper/time_and_user_stamps.rb +37 -0
- data/lib/imw/dataset/loaddump.rb +50 -0
- data/lib/imw/dataset/old/file_collection.rb +88 -0
- data/lib/imw/dataset/old/file_collection_utils.rb +71 -0
- data/lib/imw/dataset/scaffold.rb +132 -0
- data/lib/imw/dataset/scraped_uri.rb +305 -0
- data/lib/imw/dataset/scrub/old_working_scrubber.rb +87 -0
- data/lib/imw/dataset/scrub/scrub.rb +147 -0
- data/lib/imw/dataset/scrub/scrub_simple_url.rb +38 -0
- data/lib/imw/dataset/scrub/scrub_test.rb +60 -0
- data/lib/imw/dataset/scrub/slug.rb +101 -0
- data/lib/imw/dataset/stats.rb +73 -0
- data/lib/imw/dataset/stats/counter.rb +23 -0
- data/lib/imw/dataset/task.rb +38 -0
- data/lib/imw/dataset/workflow.rb +81 -0
- data/lib/imw/files.rb +110 -0
- data/lib/imw/files/archive.rb +113 -0
- data/lib/imw/files/basicfile.rb +122 -0
- data/lib/imw/files/binary.rb +28 -0
- data/lib/imw/files/compressed_file.rb +93 -0
- data/lib/imw/files/compressed_files_and_archives.rb +348 -0
- data/lib/imw/files/compressible.rb +103 -0
- data/lib/imw/files/csv.rb +112 -0
- data/lib/imw/files/json.rb +41 -0
- data/lib/imw/files/sgml.rb +65 -0
- data/lib/imw/files/text.rb +68 -0
- data/lib/imw/files/yaml.rb +46 -0
- data/lib/imw/packagers.rb +8 -0
- data/lib/imw/packagers/archiver.rb +108 -0
- data/lib/imw/packagers/s3_mover.rb +28 -0
- data/lib/imw/parsers.rb +7 -0
- data/lib/imw/parsers/html_parser.rb +382 -0
- data/lib/imw/parsers/html_parser/matchers.rb +306 -0
- data/lib/imw/parsers/line_parser.rb +87 -0
- data/lib/imw/parsers/regexp_parser.rb +72 -0
- data/lib/imw/utils.rb +24 -0
- data/lib/imw/utils/components.rb +61 -0
- data/lib/imw/utils/config.rb +46 -0
- data/lib/imw/utils/error.rb +54 -0
- data/lib/imw/utils/extensions/array.rb +125 -0
- data/lib/imw/utils/extensions/class/attribute_accessors.rb +8 -0
- data/lib/imw/utils/extensions/core.rb +43 -0
- data/lib/imw/utils/extensions/dir.rb +24 -0
- data/lib/imw/utils/extensions/file_core.rb +64 -0
- data/lib/imw/utils/extensions/hash.rb +218 -0
- data/lib/imw/utils/extensions/hpricot.rb +48 -0
- data/lib/imw/utils/extensions/string.rb +49 -0
- data/lib/imw/utils/extensions/struct.rb +42 -0
- data/lib/imw/utils/extensions/symbol.rb +28 -0
- data/lib/imw/utils/extensions/typed_struct.rb +22 -0
- data/lib/imw/utils/extensions/uri.rb +59 -0
- data/lib/imw/utils/log.rb +67 -0
- data/lib/imw/utils/misc.rb +63 -0
- data/lib/imw/utils/paths.rb +115 -0
- data/lib/imw/utils/uri.rb +59 -0
- data/lib/imw/utils/uuid.rb +33 -0
- data/lib/imw/utils/validate.rb +38 -0
- data/lib/imw/utils/version.rb +12 -0
- data/lib/imw/utils/view.rb +113 -0
- data/lib/imw/utils/view/dump_csv.rb +112 -0
- data/lib/imw/utils/view/dump_csv_older.rb +117 -0
- data/spec/data/sample.csv +131 -0
- data/spec/data/sample.tsv +131 -0
- data/spec/data/sample.txt +131 -0
- data/spec/data/sample.xml +653 -0
- data/spec/data/sample.yaml +652 -0
- data/spec/imw/dataset/datamapper/uri_spec.rb +43 -0
- data/spec/imw/dataset/datamapper_spec_helper.rb +11 -0
- data/spec/imw/files/archive_spec.rb +118 -0
- data/spec/imw/files/basicfile_spec.rb +121 -0
- data/spec/imw/files/bz2_spec.rb +32 -0
- data/spec/imw/files/compressed_file_spec.rb +96 -0
- data/spec/imw/files/compressible_spec.rb +100 -0
- data/spec/imw/files/file_spec.rb +144 -0
- data/spec/imw/files/gz_spec.rb +32 -0
- data/spec/imw/files/rar_spec.rb +33 -0
- data/spec/imw/files/tar_spec.rb +31 -0
- data/spec/imw/files/text_spec.rb +23 -0
- data/spec/imw/files/zip_spec.rb +31 -0
- data/spec/imw/files_spec.rb +38 -0
- data/spec/imw/packagers/archiver_spec.rb +125 -0
- data/spec/imw/packagers/s3_mover_spec.rb +7 -0
- data/spec/imw/parsers/line_parser_spec.rb +96 -0
- data/spec/imw/parsers/regexp_parser_spec.rb +42 -0
- data/spec/imw/utils/extensions/file_core_spec.rb +72 -0
- data/spec/imw/utils/extensions/find_spec.rb +113 -0
- data/spec/imw/utils/paths_spec.rb +38 -0
- data/spec/imw/workflow/rip/local_spec.rb +89 -0
- data/spec/imw/workflow/rip_spec.rb +27 -0
- data/spec/rcov.opts +1 -0
- data/spec/spec.opts +4 -0
- data/spec/spec_helper.rb +32 -0
- data/spec/support/archive_contents_matcher.rb +94 -0
- data/spec/support/custom_matchers.rb +21 -0
- data/spec/support/directory_contents_matcher.rb +61 -0
- data/spec/support/extensions.rb +18 -0
- data/spec/support/file_contents_matcher.rb +50 -0
- data/spec/support/random.rb +210 -0
- data/spec/support/without_regard_to_order_matcher.rb +58 -0
- metadata +196 -0
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
module IMW
  module Packagers

    # Packages an Array of input files into a single output archive.
    # When the archive is extracted, all the input files given will be
    # in a single directory with a chosen name.  The path to the output
    # archive determines both the name of the archive and its type (tar,
    # tar.bz2, zip, &c.).
    #
    # If any of the input files are themselves archives, they will first
    # be extracted, with only their contents winding up in the final
    # directory (the file hierarchy of the archive will be preserved).
    # If any of the input files are compressed, they will first be
    # uncompressed before being added to the directory.
    #
    # Input files can be renamed by passing in a Hash instead of an
    # Array.  Each key in this hash is the path to an input file and its
    # value is the new basename to give it.  If the basename is +nil+
    # then the original path's basename will be used.
    class Archiver

      # +name+ becomes the directory inside the archive; +inputs+ maps
      # absolute input paths to their basenames inside that directory.
      attr_accessor :name, :inputs

      # @param name   [String, Symbol] name of the directory the archive expands to
      # @param inputs [Array<String>, Hash{String => String,nil}] input paths,
      #   optionally mapped to replacement basenames
      def initialize name, inputs
        @name = name
        add_inputs inputs
      end

      # Register additional inputs.  Accepts either an Array of paths
      # (basenames are kept) or a Hash of path => new_basename (+nil+
      # values fall back to the path's own basename).  Paths are
      # expanded so duplicates collapse onto one key.
      def add_inputs new_inputs
        @inputs ||= {}
        if new_inputs.is_a?(Array)
          new_inputs.each do |input|
            @inputs[File.expand_path(input)] = File.basename(input)
          end
        else
          new_inputs.each_pair do |input, basename|
            @inputs[File.expand_path(input)] = (basename || File.basename(input))
          end
        end
      end

      # Accumulated processing error messages (empty when all went well).
      def errors
        @errors ||= []
      end

      # Log +error+ as a warning and record it in #errors.
      def add_processing_error error
        IMW.logger.warn error
        errors << error
      end

      # True when no processing errors have been recorded.
      def success?
        errors.empty?
      end

      # A temporary directory to work in.  Its contents will
      # ultimately consist of a directory named for the package
      # containing all the input files.  The timestamp + PID suffix
      # makes it unique on a node.
      def tmp_dir
        @tmp_dir ||= File.join(IMW.path_to(:tmp_root, 'packager'), (Time.now.to_i.to_s + "-" + $$.to_s))
      end

      # Remove the temporary working directory and everything in it.
      def clean!
        FileUtils.rm_rf(tmp_dir)
      end

      # A directory which will contain all the content being packaged,
      # including the contents of any archives that were included in
      # the list of files to process.
      def dir
        @dir ||= File.join(tmp_dir, name.to_s)
      end

      # Populate #dir: extract input archives in place, decompress
      # compressed inputs, and copy everything else under its chosen
      # basename.
      def prepare!
        FileUtils.mkdir_p dir unless File.exist?(dir)
        inputs.each_pair do |path, basename|
          new_path = File.join(dir, basename)
          # Choose the file class from the *target* basename -- the original
          # path may be a meaningless upload temp name
          # (e.g. RackMultipart20091203-958-1nkgc61-0).
          file = IMW.open(path, :as => IMW::Files.file_class_for(basename))
          case
          when file.archive?
            FileUtils.cd(dir) do
              file.extract
            end
          when file.compressed?
            file.cp(new_path).decompress!
          else
            file.cp(new_path)
          end
        end
      end

      # Package the contents of the temporary directory to an archive
      # at +output+ (a path String or an already-opened IMW file).
      # Records a processing error if the archive was not created.
      # Returns the (possibly newly opened) output object.
      def package! output, options={}
        output = IMW.open(output) if output.is_a?(String)
        FileUtils.mkdir_p(output.dirname) unless File.exist?(output.dirname)
        output.rm! if output.exist?
        FileUtils.cd(tmp_dir) do
          temp_output = IMW.open(output.basename)
          temp_output.create(name.to_s + '/*').mv(output.path)
          temp_output.rm if temp_output.exist?
          # FIX: was `output.exists?`; every other check in this method uses
          # `exist?` on the same objects, so use the consistent form.
          add_processing_error "Archiver: couldn't create archive #{output.path}" unless output.exist?
        end
        output
      end
    end
  end
end
|
|
108
|
+
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
require 'aws/s3'
module IMW
  module Packagers

    # Uploads local files to an Amazon S3 bucket via the aws-s3 gem.
    class S3Mover

      # Response object from the most recent S3 call (nil before any upload).
      attr_reader :last_response
      # Name of the destination S3 bucket.
      attr_accessor :bucket_name

      # Pulls +:bucket_name+ out of +options+ and passes the remaining
      # options (credentials, &c.) straight to
      # AWS::S3::Base.establish_connection!.
      def initialize options={}
        @bucket_name = options.delete(:bucket_name)
        AWS::S3::Base.establish_connection!(options)
      end

      # True when the most recent upload came back HTTP 200 OK.
      #
      # FIX: this method was defined twice; the first definition called
      # an undefined +errors+ method (NoMethodError if ever run) and was
      # silently overridden by this one, so it has been removed.
      def success?
        last_response && last_response.response.class == Net::HTTPOK
      end

      # Store the file at +local_path+ under the key +remote_path+ in
      # #bucket_name, remembering the response in #last_response.
      def upload! local_path, remote_path
        # File.open instead of Kernel#open: identical for ordinary paths but
        # immune to the `| command` pipe-execution behavior of Kernel#open.
        @last_response = AWS::S3::S3Object.store(remote_path, File.open(local_path), bucket_name)
      end

    end
  end
end
|
data/lib/imw/parsers.rb
ADDED
|
@@ -0,0 +1,382 @@
|
|
|
1
|
+
#
|
|
2
|
+
# h2. lib/imw/parsers/html_parser.rb -- html parser
|
|
3
|
+
#
|
|
4
|
+
# == About
|
|
5
|
+
#
|
|
6
|
+
# h4. HTML Extractor
|
|
7
|
+
#
|
|
8
|
+
# * map repeating HTML elements to intermediate ruby data structure
|
|
9
|
+
# * optimize all the common cases for expressive brevity
|
|
10
|
+
# * output structure will come from HTML structure; map to desired output objects in transform stage.
|
|
11
|
+
# * spec shouldn't be allowed to get too much more complicated than this; again, transform stage exists
|
|
12
|
+
#
|
|
13
|
+
# If this doesn't yield satisfaction you may enjoy
|
|
14
|
+
# * http://blog.labnotes.org/2006/07/11/scraping-with-style-scrapi-toolkit-for-ruby/
|
|
15
|
+
# * http://scrubyt.org/
|
|
16
|
+
# Note of course that these have quite different goals. For example, we don't
|
|
17
|
+
# have any interest in "interactive" crawling, eg form submission, or at least
|
|
18
|
+
# that goes elsewhere.
|
|
19
|
+
#
|
|
20
|
+
#
|
|
21
|
+
# == Sample HTML (http://twitter.com):
|
|
22
|
+
#
|
|
23
|
+
# <ul class="about vcard entry-author">
|
|
24
|
+
# <li ><span class="label">Name</span> <span class="fn" >MarsPhoenix </span> </li>
|
|
25
|
+
# <li ><span class="label">Location</span> <span class="adr">Mars, Solar System</span> </li>
|
|
26
|
+
# <li id="bio"><span class="label">Bio</span> <span class="bio">I dig Mars! </span> </li>
|
|
27
|
+
# <li ><span class="label">Web</span>
|
|
28
|
+
# <a href="http://tinyurl.com/5wwaru" class="url" rel="me nofollow">http://tinyurl.co...</a></li>
|
|
29
|
+
# </ul>
|
|
30
|
+
#
|
|
31
|
+
# == Parser Spec:
|
|
32
|
+
# :hcard => m_one('//ul.vcard.about',
|
|
33
|
+
# {
|
|
34
|
+
# :name => 'li/span.fn',
|
|
35
|
+
# :location => 'li/span.adr',
|
|
36
|
+
# :url => m_attr('li/a.url[@href]', 'href'),
|
|
37
|
+
# :bio => 'li#bio/span.bio',
|
|
38
|
+
# }
|
|
39
|
+
# )
|
|
40
|
+
#
|
|
41
|
+
# == Example return:
|
|
42
|
+
# { :hcard => { :name => 'Mars Phoenix', :location => 'Mars, Solar System', :bio => 'I dig Mars!', :url => 'http://tinyurl.com/5wwaru' } }
|
|
43
|
+
#
|
|
44
|
+
# == Sample HTML (http://delicious.com):
|
|
45
|
+
# <ul id="bookmarklist" class="bookmarks NOTHUMB">
|
|
46
|
+
# <li class="post" id="item-...">
|
|
47
|
+
# <div class="bookmark NOTHUMB">
|
|
48
|
+
# <div class="dateGroup"> <span title="23 APR 08">23 APR 08</span> </div>
|
|
49
|
+
# <div class="data">
|
|
50
|
+
# <h4> <a rel="nofollow" class="taggedlink" href="http://www.cs.biu.ac.il/~koppel/BlogCorpus.htm">Blog Authorship Corpus (Blogger.com 1994)</a>
|
|
51
|
+
# <a class="inlinesave" href="...">SAVE</a> </h4>
|
|
52
|
+
# <h5 class="savers-label"> PEOPLE</h5>
|
|
53
|
+
# <div class="savers savers2"> <a class="delNav" href="/url/7df6661946fca61863312644eb071953"><span class="delNavCount">26</span></a> </div>
|
|
54
|
+
# <div class="description"> The Blog Authorship Corpus consists of the collected posts of 19,320 bloggers gathered from blogger.com in August 2004. The corpus incorporates a total of 681,288 posts and over 140 million words - or approximately 35 posts and 7250 words per person. </div>
|
|
55
|
+
# </div>
|
|
56
|
+
# <div class="meta"></div>
|
|
57
|
+
# <h5 class="tag-chain-label">TAGS</h5>
|
|
58
|
+
# <div class="tagdisplay">
|
|
59
|
+
# <ul class="tag-chain">
|
|
60
|
+
# <li class="tag-chain-item off first"><a class="tag-chain-item-link" rel="tag" href="/infochimps/blog" ><span class="tag-chain-item-span">blog</span> </a></li>
|
|
61
|
+
# <li class="tag-chain-item off"> <a class="tag-chain-item-link" rel="tag" href="/infochimps/corpus" ><span class="tag-chain-item-span">corpus</span> </a></li>
|
|
62
|
+
# <li class="tag-chain-item off"> <a class="tag-chain-item-link" rel="tag" href="/infochimps/analysis" ><span class="tag-chain-item-span">analysis</span></a></li>
|
|
63
|
+
# <li class="tag-chain-item off"> <a class="tag-chain-item-link" rel="tag" href="/infochimps/nlp" ><span class="tag-chain-item-span">nlp</span> </a></li>
|
|
64
|
+
# <li class="tag-chain-item on last"> <a class="tag-chain-item-link" rel="tag" href="/infochimps/dataset" ><span class="tag-chain-item-span">dataset</span> </a></li>
|
|
65
|
+
# </ul>
|
|
66
|
+
# </div>
|
|
67
|
+
# <div class="clr"></div>
|
|
68
|
+
# </div>
|
|
69
|
+
# </li>
|
|
70
|
+
# </ul>
|
|
71
|
+
#
|
|
72
|
+
# == Parser Specification:
|
|
73
|
+
# :bookmarks => [ 'ul#bookmarklist/li.post/.bookmark',
|
|
74
|
+
# {
|
|
75
|
+
# :date => hash( '.dateGroup/span',
|
|
76
|
+
# [:year, :month, :day] => regexp( '', /(\d{2}) ([A-Z]{3}) (\d{2})/),
|
|
77
|
+
# ),
|
|
78
|
+
# :title => '.data/h4/a.taggedlink',
|
|
79
|
+
# :url => attr( '.data/h4/a.taggedlink', 'href'),
|
|
80
|
+
# :del_link_url => href( '.data/.savers/a.delNav),
|
|
81
|
+
# :num_savers => to_i( '.data/.savers//span.delNavCount'),
|
|
82
|
+
# :description => '.data/.description',
|
|
83
|
+
# :tags => ['.tagdisplay//tag-chain-item-span']
|
|
84
|
+
# }
|
|
85
|
+
# ]
|
|
86
|
+
#
|
|
87
|
+
# == Example output:
|
|
88
|
+
# { :bookmarks => [
|
|
89
|
+
# { :date => { :year => '08', :month => 'APR', :day => '23' },
|
|
90
|
+
# :title => 'Blog Authorship Corpus (Blogger.com 1994)',
|
|
91
|
+
# :url => 'http://www.cs.biu.ac.il/~koppel/BlogCorpus.htm',
|
|
92
|
+
# :del_link_url => '/url/7df6661946fca61863312644eb071953',
|
|
93
|
+
# :num_savers => 26,
|
|
94
|
+
# :description => 'The Blog ... ',
|
|
95
|
+
# :tags => ['blog', 'corpus', 'analysis', 'nlp', 'dataset'],
|
|
96
|
+
# }
|
|
97
|
+
# ]}
|
|
98
|
+
#
|
|
99
|
+
# == Implementation:
|
|
100
|
+
#
|
|
101
|
+
# Internally, we take the spec and turn it into a recursive structure of Matcher
|
|
102
|
+
# objects. These consume Hpricot Elements and return the appropriately extracted
|
|
103
|
+
# object.
|
|
104
|
+
#
|
|
105
|
+
# Note that the /default/ is for a bare selector to match ONE element, and to not
|
|
106
|
+
# complain if there are many.
|
|
107
|
+
#
|
|
108
|
+
# Missing elements are silently ignored -- for example if
|
|
109
|
+
# :foo => 'li.missing'
|
|
110
|
+
# there will simply be no :foo element in the hash (as opposed to having hsh[:foo]
|
|
111
|
+
# set to nil -- hsh.include?(foo) will be false)
|
|
112
|
+
#
|
|
113
|
+
#
|
|
114
|
+
# == List of Matchers:
|
|
115
|
+
# { :field => /spec/, ... } # hash hash, each field taken from spec.
|
|
116
|
+
# [ "hpricot_path" ] # 1-el array array: for each element matching
|
|
117
|
+
# hpricot_path, the inner_html
|
|
118
|
+
# [ "hpricot_path", /spec/ ] # 2-el array array: for each element matching
|
|
119
|
+
# hpricot_path, pass to spec
|
|
120
|
+
# "hpricot_path" # string same as one("hpricot_path")
|
|
121
|
+
# one("hpricot_path") # one first match to hpricot_path
|
|
122
|
+
# one("hpricot_path", /spec/) # one applies spec to first match to hpricot_path
|
|
123
|
+
# (these all match on one path:)
|
|
124
|
+
# regexp("hpricot_path", /RE/) # regexp capture groups from matching RE against
|
|
125
|
+
# inner_html of first match to hpricot_path
|
|
126
|
+
# attr("hpricot_path", 'attr_name') # attr
|
|
127
|
+
# href("hpricot_path") # href shorthand for attr(foo, 'href')
|
|
128
|
+
# no_html # strip tags from contents
|
|
129
|
+
# html_encoded # html encode contents
|
|
130
|
+
# to_i, to_f, etc # convert
|
|
131
|
+
# lambda{|doc| ... } # proc calls proc on current doc
|
|
132
|
+
#
|
|
133
|
+
# == Complicated HCard example:
|
|
134
|
+
# :hcards => [ '//ul.users/li.vcard',
|
|
135
|
+
# {
|
|
136
|
+
# :name => '.fn',
|
|
137
|
+
# :address => one('.adr',
|
|
138
|
+
# :street => '.street',
|
|
139
|
+
# :city => '.city',
|
|
140
|
+
# :zip => '.postal'
|
|
141
|
+
# )
|
|
142
|
+
# :tel => [ 'span.tel',
|
|
143
|
+
# {
|
|
144
|
+
# :type => 'span.type',
|
|
145
|
+
# [:cc, :area, :num] => hp.regexp('span.value', /+(\d+).(\d{3})-(\d{3}-\d{4})/),
|
|
146
|
+
# }
|
|
147
|
+
# ]
|
|
148
|
+
# :tags => [ '.tag' ],
|
|
149
|
+
# }
|
|
150
|
+
# ]
|
|
151
|
+
#
|
|
152
|
+
# == Resulting Parser
|
|
153
|
+
# MatchHash({:hcards => MatchArray('//ul.users/li.hcard',
|
|
154
|
+
# MatchHash({
|
|
155
|
+
# :name => MatchFirst('.fn'),
|
|
156
|
+
# :address => MatchFirst('.adr',
|
|
157
|
+
# MatchHash({
|
|
158
|
+
# :street => MatchFirst('.street'),
|
|
159
|
+
# :city => MatchFirst('.locality),
|
|
160
|
+
# :state => MatchFirst('.region),
|
|
161
|
+
# :zip => MatchFirst('.postal'),
|
|
162
|
+
# }))
|
|
163
|
+
# :tel => MatchArray('span.tel',
|
|
164
|
+
# MatchHash({
|
|
165
|
+
# :type => MatchFirst('span.type'),
|
|
166
|
+
# [:cc, :area, :num] => RegexpMatcher('span.value', /+(\d+).(\d{3})-(\d{3}-\d{4})/),
|
|
167
|
+
# })
|
|
168
|
+
# )
|
|
169
|
+
# :tags => MatchArray('.tag'),
|
|
170
|
+
# })
|
|
171
|
+
# )
|
|
172
|
+
#
|
|
173
|
+
# == Example output
|
|
174
|
+
# [
|
|
175
|
+
# {:tel => [ {:type => 'home', :cc => '49', :area => '305', :num => '555-1212'},
|
|
176
|
+
# {:type => 'work', :cc => '49', :area => '305', :num => '555-6969'}, ],
|
|
177
|
+
# :name => "Bob Dobbs, Jr.",
|
|
178
|
+
# :tags => ["church"] },
|
|
179
|
+
# {:tel => [ {:type => 'fax', :cc => '49', :area => '305', :num => '867-5309'}, ],
|
|
180
|
+
# :name => "Jenny",
|
|
181
|
+
# :address => { :street => "53 Evergreen Terr.", :city => "Springfield" },
|
|
182
|
+
# :tags => ["bathroom", "wall"] },
|
|
183
|
+
# ]
|
|
184
|
+
#
|
|
185
|
+
# Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
|
|
186
|
+
# Copyright:: Copyright (c) 2008 infochimps.org
|
|
187
|
+
# License:: GPL 3.0
|
|
188
|
+
# Website:: http://infinitemonkeywrench.org/
|
|
189
|
+
#
|
|
190
|
+
# puts "#{File.basename(__FILE__)}: Something clever" # at bottom
|
|
191
|
+
|
|
192
|
+
require 'imw/parsers/html_parser/matchers'
|
|
193
|
+
|
|
194
|
+
class IMW::HTMLParser

  include IMW::HTMLParserMatcher

  # Root of the compiled matcher tree built from the parser spec.
  attr_accessor :parse_tree

  #
  # Build the parse tree from +arg_spec+ if given, otherwise from this
  # class's .parser_spec (subclasses override that hook).
  #
  def initialize arg_spec=nil
    spec = arg_spec || self.class.parser_spec
    self.parse_tree = IMW::HTMLParserMatcher.build_parse_tree(spec)
  end

  #
  # Hook for subclasses: return the spec hash describing what to
  # extract.  See the file header comment for the spec syntax.
  #
  def self.parser_spec
    raise "Override this to create your own parser spec"
  end

  #
  # Walk +doc+ (an Hpricot document/element) with the parse tree and
  # return the extracted structure.
  #
  def parse doc
    self.parse_tree.match(doc)
  end

  # one("hpricot_path")           first match to hpricot_path
  # one("hpricot_path", /spec/)   applies spec to first match to hpricot_path
  #
  # FIX: +matcher+ now defaults to nil like every other helper here, so
  # the documented one-argument form one("hpricot_path") actually works.
  def self.one selector, matcher=nil
    MatchFirstElement.new(selector, IMW::HTMLParserMatcher.build_parse_tree(matcher))
  end

  # match the +attr+ attribute of the first element given by +selector+
  def self.attr selector, attr, matcher=nil
    MatchAttribute.new(selector, attr, IMW::HTMLParserMatcher.build_parse_tree(matcher))
  end

  # shorthand for +attr(foo, 'href')+
  def self.href selector, matcher=nil
    self.attr(selector, 'href', matcher)
  end

  # shorthand for +attr(foo, 'src')+
  def self.src selector, matcher=nil
    self.attr(selector, 'src', matcher)
  end

  # apply +proc+ to each value extracted by +selector+/+matcher+
  def self.proc selector, proc, matcher=nil
    MatchProc.new(selector, proc, IMW::HTMLParserMatcher.build_parse_tree(matcher))
  end

  # strip ","s (!! thus disrespecting locale !!!)
  # and convert to int
  def self.to_num selector, matcher=nil
    proc selector, lambda{|num| num.to_s.gsub(/,/,'').to_i if num }, matcher
  end

  # serialize the extracted value as JSON (nil passes through)
  def self.to_json selector, matcher=nil
    proc selector, lambda{|v| v.to_json if v }, matcher
  end

  # strip surrounding whitespace from the extracted value
  def self.strip selector, matcher=nil
    proc selector, lambda{|v| v.strip }, matcher
  end

  # all capture groups of +re+ against the selected element
  def self.re_group selector, re
    MatchRegexp.new(selector, re)
  end

  # first capture group of +re+ against the selected element
  def self.re selector, re
    MatchRegexp.new(selector, re, nil, :capture => 1)
  end

  # every match of +re+ within the selected element
  def self.re_all selector, re, matcher=nil
    MatchRegexpRepeatedly.new(selector, re)
  end

  # def self.plain_text selector, matcher=nil
  #   proc selector, lambda{|el| el.inner_text if el }, matcher
  # end

  # -- Commented-out legacy implementation, preserved for reference ----
  #
  # attr_accessor :mapping
  #
  # #
  # # Feed me a hash and I'll semantify HTML
  # #
  # # The hash should magically adhere to the too-complicated,
  # # ever evolving goatrope that works for the below
  # #
  # #
  # def initialize mapping
  #   self.mapping = mapping
  # end
  #
  # #
  # # take a document subtree,
  # # and a mapping of hpricot paths to that subtree's data mapping
  # # recursively extract that datamapping
  # #
  # def extract_tree hdoc, content, sub_mapping
  #   data = { }
  #   sub_mapping.each do |selector, target|
  #     data[selector] = []
  #     sub_contents = content/selector
  #     sub_contents.each do |sub_content|
  #       sub_data = {}
  #       extract_node hdoc, sub_content, sub_data, selector, target
  #       data[selector] << sub_data
  #     end
  #   end
  #   data
  #   # end
  #   # if selector.is_a?(String)
  #   #   conts = (content)
  #   # else
  #   #   conts = [content]
  #   # end
  #   # conts[0..0].each do |content|
  #   #   extract_node hdoc, content, data, selector, target
  #   # end
  #   # end
  #   data
  # end
  #
  # #
  # # insert the extracted element into the data mapping
  # #
  # def extract_node hdoc, content, data, selector, target
  #   classification = classify_node(selector, target)
  #   result = \
  #     case classification
  #     when :subtree
  #       target.each do |sub_selector, sub_target|
  #         extract_node hdoc, content, data, sub_selector, sub_target
  #       end
  #
  #     when :sub_attribute
  #       k, v = selector.to_a[0]
  #       subcontent = (k[0..0] == '/') ? (hdoc.at(k)) : (content.at(k))
  #       val = subcontent.attributes[v.to_s] if subcontent
  #       data[target] = val unless val.blank?
  #
  #     when :attribute then
  #       val = content.attributes[selector.to_s]
  #       data[target] = val unless val.blank?
  #
  #     when :flatten_list
  #       subcontents = (selector[0..0] == '/') ? (hdoc/selector) : (content/selector)
  #       data[target.first] = subcontents.map{|subcontent| subcontent.inner_html }
  #
  #     when :inner_html
  #       subcontent = (selector[0..0] == '/') ? (hdoc.at(selector)) : (content.at(selector))
  #       data[target] = subcontent.inner_html.strip if subcontent
  #
  #     else
  #       raise "classify_node shouldn't ever return #{classification}"
  #     end
  #   # puts "%-19s %-19s %-31s %s" % [target.inspect[0..18], classification.inspect[0..18], selector.inspect[0..30], result.inspect[0..90]] if (classification == :sub_attribute)
  #   # puts '' if classification == :subtree
  # end
  #
  # def classify_node selector, target
  #   case
  #   when target.is_a?(Hash) then :subtree
  #   when selector.is_a?(Hash) && (selector.length == 1) then
  #     k, v = selector.to_a[0]
  #     case v
  #     when Symbol then :sub_attribute
  #     end
  #   when selector.is_a?(Symbol) then :attribute
  #   when selector.is_a?(String) && target.is_a?(Array) then :flatten_list
  #   when selector.is_a?(String) && target.is_a?(Symbol) then :inner_html
  #   else
  #     raise "Can't classify mapping: " + [selector, target].join(" - ")
  #   end
  # end
  #
  # # use #mapping to parse file
  # def parse link
  #   begin hdoc = Hpricot(link.contents)
  #   rescue; warn "can't hpricot #{link.to_s}" ; return false; end
  #   raw_taggings = extract_tree hdoc, hdoc, self.mapping
  # end
  #
  # # use #mapping to parse file
  # def parse_file filename
  #   begin hdoc = Hpricot(File.open(filename))
  #   rescue; warn "can't hpricot #(unknown)" ; return false; end
  #   raw_taggings = extract_tree hdoc, hdoc, self.mapping
  # end
end
|