imw 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +15 -0
- data/CHANGELOG +0 -0
- data/LICENSE +674 -0
- data/README.rdoc +101 -0
- data/Rakefile +20 -0
- data/VERSION +1 -0
- data/etc/imwrc.rb +76 -0
- data/lib/imw.rb +42 -0
- data/lib/imw/boot.rb +58 -0
- data/lib/imw/dataset.rb +233 -0
- data/lib/imw/dataset/datamapper.rb +66 -0
- data/lib/imw/dataset/datamapper/time_and_user_stamps.rb +37 -0
- data/lib/imw/dataset/loaddump.rb +50 -0
- data/lib/imw/dataset/old/file_collection.rb +88 -0
- data/lib/imw/dataset/old/file_collection_utils.rb +71 -0
- data/lib/imw/dataset/scaffold.rb +132 -0
- data/lib/imw/dataset/scraped_uri.rb +305 -0
- data/lib/imw/dataset/scrub/old_working_scrubber.rb +87 -0
- data/lib/imw/dataset/scrub/scrub.rb +147 -0
- data/lib/imw/dataset/scrub/scrub_simple_url.rb +38 -0
- data/lib/imw/dataset/scrub/scrub_test.rb +60 -0
- data/lib/imw/dataset/scrub/slug.rb +101 -0
- data/lib/imw/dataset/stats.rb +73 -0
- data/lib/imw/dataset/stats/counter.rb +23 -0
- data/lib/imw/dataset/task.rb +38 -0
- data/lib/imw/dataset/workflow.rb +81 -0
- data/lib/imw/files.rb +110 -0
- data/lib/imw/files/archive.rb +113 -0
- data/lib/imw/files/basicfile.rb +122 -0
- data/lib/imw/files/binary.rb +28 -0
- data/lib/imw/files/compressed_file.rb +93 -0
- data/lib/imw/files/compressed_files_and_archives.rb +348 -0
- data/lib/imw/files/compressible.rb +103 -0
- data/lib/imw/files/csv.rb +112 -0
- data/lib/imw/files/json.rb +41 -0
- data/lib/imw/files/sgml.rb +65 -0
- data/lib/imw/files/text.rb +68 -0
- data/lib/imw/files/yaml.rb +46 -0
- data/lib/imw/packagers.rb +8 -0
- data/lib/imw/packagers/archiver.rb +108 -0
- data/lib/imw/packagers/s3_mover.rb +28 -0
- data/lib/imw/parsers.rb +7 -0
- data/lib/imw/parsers/html_parser.rb +382 -0
- data/lib/imw/parsers/html_parser/matchers.rb +306 -0
- data/lib/imw/parsers/line_parser.rb +87 -0
- data/lib/imw/parsers/regexp_parser.rb +72 -0
- data/lib/imw/utils.rb +24 -0
- data/lib/imw/utils/components.rb +61 -0
- data/lib/imw/utils/config.rb +46 -0
- data/lib/imw/utils/error.rb +54 -0
- data/lib/imw/utils/extensions/array.rb +125 -0
- data/lib/imw/utils/extensions/class/attribute_accessors.rb +8 -0
- data/lib/imw/utils/extensions/core.rb +43 -0
- data/lib/imw/utils/extensions/dir.rb +24 -0
- data/lib/imw/utils/extensions/file_core.rb +64 -0
- data/lib/imw/utils/extensions/hash.rb +218 -0
- data/lib/imw/utils/extensions/hpricot.rb +48 -0
- data/lib/imw/utils/extensions/string.rb +49 -0
- data/lib/imw/utils/extensions/struct.rb +42 -0
- data/lib/imw/utils/extensions/symbol.rb +28 -0
- data/lib/imw/utils/extensions/typed_struct.rb +22 -0
- data/lib/imw/utils/extensions/uri.rb +59 -0
- data/lib/imw/utils/log.rb +67 -0
- data/lib/imw/utils/misc.rb +63 -0
- data/lib/imw/utils/paths.rb +115 -0
- data/lib/imw/utils/uri.rb +59 -0
- data/lib/imw/utils/uuid.rb +33 -0
- data/lib/imw/utils/validate.rb +38 -0
- data/lib/imw/utils/version.rb +12 -0
- data/lib/imw/utils/view.rb +113 -0
- data/lib/imw/utils/view/dump_csv.rb +112 -0
- data/lib/imw/utils/view/dump_csv_older.rb +117 -0
- data/spec/data/sample.csv +131 -0
- data/spec/data/sample.tsv +131 -0
- data/spec/data/sample.txt +131 -0
- data/spec/data/sample.xml +653 -0
- data/spec/data/sample.yaml +652 -0
- data/spec/imw/dataset/datamapper/uri_spec.rb +43 -0
- data/spec/imw/dataset/datamapper_spec_helper.rb +11 -0
- data/spec/imw/files/archive_spec.rb +118 -0
- data/spec/imw/files/basicfile_spec.rb +121 -0
- data/spec/imw/files/bz2_spec.rb +32 -0
- data/spec/imw/files/compressed_file_spec.rb +96 -0
- data/spec/imw/files/compressible_spec.rb +100 -0
- data/spec/imw/files/file_spec.rb +144 -0
- data/spec/imw/files/gz_spec.rb +32 -0
- data/spec/imw/files/rar_spec.rb +33 -0
- data/spec/imw/files/tar_spec.rb +31 -0
- data/spec/imw/files/text_spec.rb +23 -0
- data/spec/imw/files/zip_spec.rb +31 -0
- data/spec/imw/files_spec.rb +38 -0
- data/spec/imw/packagers/archiver_spec.rb +125 -0
- data/spec/imw/packagers/s3_mover_spec.rb +7 -0
- data/spec/imw/parsers/line_parser_spec.rb +96 -0
- data/spec/imw/parsers/regexp_parser_spec.rb +42 -0
- data/spec/imw/utils/extensions/file_core_spec.rb +72 -0
- data/spec/imw/utils/extensions/find_spec.rb +113 -0
- data/spec/imw/utils/paths_spec.rb +38 -0
- data/spec/imw/workflow/rip/local_spec.rb +89 -0
- data/spec/imw/workflow/rip_spec.rb +27 -0
- data/spec/rcov.opts +1 -0
- data/spec/spec.opts +4 -0
- data/spec/spec_helper.rb +32 -0
- data/spec/support/archive_contents_matcher.rb +94 -0
- data/spec/support/custom_matchers.rb +21 -0
- data/spec/support/directory_contents_matcher.rb +61 -0
- data/spec/support/extensions.rb +18 -0
- data/spec/support/file_contents_matcher.rb +50 -0
- data/spec/support/random.rb +210 -0
- data/spec/support/without_regard_to_order_matcher.rb +58 -0
- metadata +196 -0
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
|
|
2
|
+
|
|
3
|
+
module IMW
  #
  # Mixin giving an object a +scrubbed+ method.
  #
  # The including object must itself respond to +to_dirpath+; that method is
  # not defined in this module.
  #
  module URIScrubber

    # Returns whatever the receiver's +to_dirpath+ returns.
    def scrubbed()
      return to_dirpath
    end

  end
end
|
|
11
|
+
|
|
12
|
+
module Scrub
  #
  # Scrubber for URLs. The target shape is described by +complaint+ below.
  #
  # A string counts as valid when its lowercased form already equals the
  # sanitized form.
  #
  class SimplifiedURL < Scrub::Generic
    self.complaint = "should follow our zany simplified URL rules: com.host.dot-reversed:schemeifnothttp/path/seg_men-ts/stuff.ext-SHA1ifweird"
    self.validator = %r{#{Addressable::URI::SAFE_CHARS}#{Addressable::URI::RESERVED_CHARS}}u
    self.replacer  = ''
    include Scrub::Lowercased
    attr_accessor :uri

    # True when +str+ (stringified and lowercased) is identical to what
    # +sanitize+ produces for it.
    def valid?(str)
      lowered   = str.to_s.downcase
      sanitized = sanitize(str)
      lowered == sanitized
    end

    # Parses +str+ heuristically as a URI, normalizes it, and returns its
    # +scrubbed+ form when the host is valid, or its +uuid_path+ otherwise.
    #
    # NOTE(review): +host_valid?+, +scrubbed+ and +uuid_path+ are not defined
    # in this file -- presumably URI extensions loaded elsewhere; confirm.
    # The parsed URI is kept in a local only; the +uri+ accessor is not set.
    def sanitize(str)
      # if this fails just normalize once, or don't set $KCODE: http://bit.ly/1664vp
      parsed = Addressable::URI.heuristic_parse(str.to_s).normalize
      return parsed.uuid_path unless parsed.host_valid?
      parsed.scrubbed
    end
  end
end
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
3
|
+
require 'scrub'
|
|
4
|
+
require 'scrub_simple_url'
|
|
5
|
+
|
|
6
|
+
# Sample inputs run through every scrubber below: empty/nil values, plain
# identifiers, strings with punctuation, whitespace and control characters,
# non-ASCII text, and a range of URL and path shapes.
test_strings = [
  nil, '', '12', '123', 'simple', 'UPPER', 'CamelCased', 'iden_tifier_23_',
  'twentyfouralphacharslong', 'twentyfiveatozonlyletters', 'hello.-_there@funnychar.com',
  "tab\t", "newline\n",
  "Iñtërnâtiônàlizætiøn",
  'semicolon;', 'quote"', 'tick\'', 'backtick`', 'percent%', 'plus+', 'space ',
  'leftanglebracket<', 'ampersand&',
  "control char-bel\x07",
  "http://foo.bar.com/",
  "HTTP://FOO.BAR.com",
  ".com/zazz",
  "scheme://user_name@user_acct:passwd@host-name.museum:9047/path;pathquery/p!a-th~2/path?query=param&query=pa%20ram#fragment",
  "http://web.site.com/path/path/file.ext",
  "ftp://ftp.site.com/path/path/file.ext",
  "/absolute/pathname/file.ext",
  "http://foo.bar.com/.hidden_file_with.ext",
  "http://foo.bar.com/.hidden_file",
  "dir/--/non_alpha_path_segment.ext",
  "http://foo.bar.com/dir/../two_dots_in_path",

]


# Scrubbers under test, keyed by the label printed above their results.
scrubbers = {
  # :unicode_title => Scrub::UnicodeTitle.new,
  # :title => Scrub::Title.new,
  # :identifier => Scrub::Identifier.new,
  # :free_text => Scrub::FreeText.new,
  :handle => Scrub::Handle.new,
  :simplified_url => Scrub::SimplifiedURL.new,
  # :domain => Scrub::Domain.new,
  # :email => Scrub::Email.new,
}

# For each scrubber print one row per input: validity, sanitized form, original.
# Invalid inputs are listed first, then valid ones.
scrubbers.each do |scrubber_name, scrubber|
  puts scrubber_name
  results = test_strings.map do |test_string|
    [!!scrubber.valid?(test_string), scrubber.sanitize(test_string).inspect, test_string.inspect ]
  end
  # partition (rather than sort_by on a two-valued key) keeps the rows of each
  # group in the same order as test_strings; sort_by is not a stable sort.
  invalid_rows, valid_rows = results.partition { |val, _san, _orig| !val }
  (invalid_rows + valid_rows).each do |val, san, orig|
    puts " %-5s %-30s %-30s" % [val,san,orig]
  end
end
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
# 'foo@bar.com', 'foo@newskool-tld.museum', 'foo@twoletter-tld.de', 'foo@nonexistant-tld.qq',
|
|
53
|
+
# 'r@a.wk', '1234567890-234567890-234567890-234567890-234567890-234567890-234567890-234567890-234567890@gmail.com',
|
|
54
|
+
# 'hello.-_there@funnychar.com', 'uucp%addr@gmail.com', 'hello+routing-str@gmail.com',
|
|
55
|
+
# 'domain@can.haz.many.sub.doma.in',],
|
|
56
|
+
# :invalid => [nil, '', '!!@nobadchars.com', 'foo@no-rep-dots..com', 'foo@badtld.xxx', 'foo@toolongtld.abcdefg',
|
|
57
|
+
# 'Iñtërnâtiônàlizætiøn@hasnt.happened.to.email', 'need.domain.and.tld@de', "tab\t", "newline\n",
|
|
58
|
+
# 'r@.wk', '1234567890-234567890-234567890-234567890-234567890-234567890-234567890-234567890-234567890@gmail2.com',
|
|
59
|
+
# # these are technically allowed but not seen in practice:
|
|
60
|
+
# 'uucp!addr@gmail.com', 'semicolon;@gmail.com', 'quote"@gmail.com', 'tick\'@gmail.com', 'backtick`@gmail.com', 'space @gmail.com', 'bracket<@gmail.com', 'bracket>@gmail.com'
|
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
require 'rubygems'
|
|
2
|
+
require 'addressable/uri'
|
|
3
|
+
require 'uuidtools'
|
|
4
|
+
require 'scrub'
|
|
5
|
+
require 'scrub_simple_url'
|
|
6
|
+
|
|
7
|
+
module IMW

  #
  # +handle+ -- reasonable effort at a uniq-ish, but human-comprehensible string.
  # A handle should only contain the characters A-Za-z0-9_-./
  #
  class Slug
    # A humane representation of the handle ('that-one-time-at_foo').
    # Always stored in sanitized form (see +handle=+); may be nil.
    attr_reader :handle
    # The purportedly unique string; set to the raw, unsanitized constructor
    # argument by +initialize+.
    attr_accessor :uniqish

    # handle:: raw string to build the slug from (nil is accepted).
    def initialize handle
      self.handle  = handle
      self.uniqish = handle
    end

    #
    # Makes a name-based (SHA1) UUID from +full_handle+ within
    # UUID_URL_NAMESPACE.
    #
    # NOTE(review): +full_handle+, +UUID+ and +UUID_URL_NAMESPACE+ are not
    # defined in this file -- confirm they are provided elsewhere.
    #
    def uuid
      UUID.sha1_create(UUID_URL_NAMESPACE, full_handle)
    end

    # The handle reduced to word characters and '-': every run of characters
    # other than word characters, '/' and ':' becomes '-', each '_' is doubled,
    # and then every run of '/' or ':' becomes a single '_'.
    # Returns '' when there is no handle.
    def url_sane
      return '' if !handle
      handle.gsub(/[^\w\/\:]+/, '-').gsub(/_/, '__').gsub(%r{[/:]+}, '_')
    end

    # Stores the sanitized form of +t+ as the handle.
    def handle= t
      @handle = self.class.sanitize_handle(t)
    end

    # Strip all but handle-safe characters: every run of characters outside
    # A-Za-z0-9_-./ is replaced by +turd+.
    #
    # t::    value to sanitize; converted with +to_s+. nil is passed through
    #        as nil so that a slug can be built without a handle (+url_sane+
    #        already copes with a nil handle).
    # turd:: replacement string, '-' by default.
    def self.sanitize_handle t, turd='-'
      return nil if t.nil?
      t.to_s.gsub(%r{[^\w\-\./]+}, turd)
    end
  end

  #
  # Uses a URL (that's locator, not URI) as a
  # presumed-uniq identifier.
  #
  # +uniqish+ returns the full normalized URL
  #
  # +handle+ is formed from the dot-reversed host, the scheme (if not http) and a
  # sanitized version of the path. (The query string, fragment, etc are stripped
  # from the handle)
  #
  # +uuid+ is inherited unchanged from Slug.
  #
  class URLSlug < Slug
    # The parsed, normalized Addressable::URI.
    attr_accessor :url

    # url_str:: URL to parse (heuristically) and normalize.
    #
    # Raises RuntimeError ("Bad URL ...") when the parsed URL has no host.
    #
    # NOTE(review): +munge_url+ is not defined in this file -- confirm it is
    # provided elsewhere.
    def initialize url_str
      self.url = Addressable::URI.heuristic_parse(url_str).normalize
      raise "Bad URL #{url}" unless url.host
      self.uniqish = url.to_s
      self.handle  = munge_url
    end
  end
end
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
#
# Mixin that fills in +name+ and +handle+ from a slug before a record is saved.
#
# Including it registers +create_slug+ through the includer's
# <tt>before :save</tt> hook and adds a class-level +slug_on+ setting.
#
module Sluggable

  # Builds a slug and copies its handle onto the record.
  #
  # When the class is configured with <tt>slug_on :url</tt>, or the record's
  # name is blank, the slug comes from +url+ and its handle also replaces
  # +name+. Otherwise the slug comes from +name+. In both cases +handle+ is
  # only assigned if it is not already set.
  def create_slug
    "Slugging #{self.attributes}"
    slug_from_url = (self.class.slug_on == :url) || (self.name.blank?)
    slug = slug_from_url ? IMW::URLSlug.new(self.url) : IMW::Slug.new(self.name)
    self.name = slug.handle if slug_from_url
    self.handle ||= slug.handle
  end
  protected :create_slug

  # Hook run when the module is included: registers the save callback, then
  # defines +slug_on+ on the including class.
  def self.included(base)
    base.before :save, :create_slug
    class << base
      # Sets the slug source on first call with a value; returns the stored
      # value afterwards.
      def slug_on(s = nil)
        @slug_on ||= s
      end
    end
  end
end
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
#
|
|
2
|
+
# h2. lib/imw/dataset/stats.rb -- statistics for datasets
|
|
3
|
+
#
|
|
4
|
+
# == About
|
|
5
|
+
#
|
|
6
|
+
# Implements methods to calculate very basic statistical properties of
|
|
7
|
+
# a dataset.
|
|
8
|
+
#
|
|
9
|
+
# Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
|
|
10
|
+
# Copyright:: Copyright (c) 2008 infochimps.org
|
|
11
|
+
# License:: GPL 3.0
|
|
12
|
+
# Website:: http://infinitemonkeywrench.org/
|
|
13
|
+
#
|
|
14
|
+
# puts "#{File.basename(__FILE__)}: Something clever" # at bottom
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
module IMW
  class Dataset
    #
    # simple histogram
    #
    # Runs down one column/attribute of a dataset
    # returning counts for that column
    #
    # slicer:: a callable or a key, as accepted by +slice+.
    #
    # Returns a new instance of this class built from an array of
    # <tt>[count, element]</tt> pairs.
    #
    def hist slicer
      counts = Hash.new(0)
      slice(slicer).each { |el| counts[el] += 1 }
      self.class.new(counts.map{ |el,ct| [ct,el] })
    end

    #
    # Extracts one value per row.
    #
    # slicer:: if it responds to +call+ it is called with each row;
    #          otherwise it is used as an index/key into each row.
    #
    def slice slicer
      if slicer.respond_to?(:call)
        self.map{ |row| slicer.call(row) }
      else
        self.map{ |row| row[slicer] }
      end
    end

    #
    # Report
    #
    # Prints the most frequent elements (unless <tt>:do_hist => false</tt>)
    # followed by size statistics for the given slice.
    #
    # opts:: <tt>:n_top</tt> (default 20), <tt>:fmt</tt> (default
    #        "%7d\t%s"), <tt>:do_hist</tt> (default true), <tt>:hist_args</tt>.
    #        The caller's hash is not modified.
    #
    # NOTE(review): +data+ is not defined in this file -- presumably an
    # accessor on Dataset declared elsewhere; confirm.
    #
    def report slicer, opts={}
      opts   = stats_report_defaults.merge(opts)
      counts = hist(slicer)
      report_hist  data, counts, slicer, opts if opts[:do_hist]
      report_sizes data, counts, slicer, opts
    end

    #
    # Prints the number of unique, total and nil elements and the min/max of
    # the non-nil unique values.
    #
    # data::   the collection that was sliced (only its +length+ is used).
    # counts:: <tt>[count, element]</tt> pairs as produced by +hist+.
    #
    def report_sizes data, counts, slicer, opts={}
      opts = stats_report_defaults.merge(opts)
      fmt  = opts[:fmt]
      # counts holds [count, element] pairs, so the number of nil elements is
      # the tally recorded against a nil element, not the number of nil pairs.
      nil_total = counts.find_all{ |ct,el| el.nil? }.inject(0){ |sum,pair| sum + pair.first }
      puts fmt % [counts.length, "unique elements"]
      puts fmt % [data.length,   "total elements"]
      puts fmt % [nil_total,     "nil elements"]
      uniqvals = counts.map{|ct,el| el}.reject(&:nil?)
      puts " min:\t#{uniqvals.min}"
      puts " max:\t#{uniqvals.max}"
    end

    # Most popular
    #
    # Prints up to <tt>opts[:n_top]</tt> of the most frequent elements, in
    # ascending order of frequency.
    def report_hist data, counts, slicer, opts={}
      opts = stats_report_defaults.merge(opts)
      # last(n) returns the whole list when there are fewer than n entries;
      # a [-n..-1] slice would be nil in that case.
      top = counts.sort_by{|ct,el| ct}.last(opts[:n_top])
      puts "Top #{opts[:n_top]} elements for slice through #{slicer}:"
      puts " -freq-\t-element-"
      puts top.map{ |ct,el| opts[:fmt] % [ct,el] }
      puts "-------\t-------"
    end

    private

    # Fresh copy of the default report options.
    def stats_report_defaults
      { :n_top => 20, :hist_args => [], :fmt => "%7d\t%s", :do_hist => true }
    end

  end
end
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
module IMW
  #
  # A Hash that tallies how many times each value has been recorded.
  #
  class RecordCounter < Hash
    # Adds one to the tally for +val+ (starting from zero) and returns the
    # new tally.
    def record val
      self[val] = (self[val] || 0) + 1
    end

    # Yields if +val+ has been recorded before, then records it.
    # The block is optional; without one the value is simply recorded.
    # Returns the new tally for +val+.
    def if_seen val, &block
      yield if self[val] && block_given?
      record val
    end

    # Yields if +val+ has not been recorded before, then records it.
    # The block is optional; without one the value is simply recorded.
    # Returns the new tally for +val+.
    def unless_seen val, &block
      yield if !self[val] && block_given?
      record val
    end

  end
end
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
#
|
|
2
|
+
# h2. lib/imw/dataset/task.rb -- Rake task class for IMW datasets
|
|
3
|
+
#
|
|
4
|
+
# == About
|
|
5
|
+
#
|
|
6
|
+
# This file defines a class <tt>IMW::Task</tt> which subclasses
|
|
7
|
+
# <tt>Rake::Task</tt>. Tasks defined in IMW should be instances of
|
|
8
|
+
# <tt>IMW::Task</tt>.
|
|
9
|
+
#
|
|
10
|
+
# Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
|
|
11
|
+
# Copyright:: Copyright (c) 2008 infochimps.org
|
|
12
|
+
# License:: GPL 3.0
|
|
13
|
+
# Website:: http://infinitemonkeywrench.org/
|
|
14
|
+
#
|
|
15
|
+
# puts "#{File.basename(__FILE__)}: Something clever" # at bottom
|
|
16
|
+
|
|
17
|
+
require 'rake'
|
|
18
|
+
|
|
19
|
+
module IMW

  # Task class used for tasks defined through IMW. It adds nothing to
  # <tt>Rake::Task</tt>; the class body is empty.
  class Task < Rake::Task; end

  class Dataset
    include Rake::TaskManager

    # Defines (via <tt>Rake::TaskManager#define_task</tt>) an
    # <tt>IMW::Task</tt> for +name+ and returns the result.
    #
    # name::  passed straight through to +define_task+.
    # block:: optional task body, also passed through.
    def task(name, &block)
      define_task(IMW::Task, name, &block)
    end

  end
end
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
#
|
|
2
|
+
# lib/imw/dataset/workflow.rb -- implements the workflow module
|
|
3
|
+
#
|
|
4
|
+
# == About
|
|
5
|
+
#
|
|
6
|
+
# This file implements the <tt>IMW::Workflow</tt> module which tailors
|
|
7
|
+
# the functionality of Rake for IMW objects.
|
|
8
|
+
#
|
|
9
|
+
# Author:: Philip flip Kromer for infochimps.org (mailto:coders@infochimps.org)
|
|
10
|
+
# Copyright:: Copyright (c) 2008 infochimps.org
|
|
11
|
+
# License:: GPL 3.0
|
|
12
|
+
# Website:: http://infinitemonkeywrench.org/
|
|
13
|
+
#
|
|
14
|
+
|
|
15
|
+
require 'imw/dataset/scaffold'
|
|
16
|
+
require 'imw/dataset/task'
|
|
17
|
+
|
|
18
|
+
module IMW

  # Methods that define the Rake[http://rake.rubyforge.org/] tasks belonging
  # to a dataset. The including object is expected to provide +define_task+
  # and task lookup via <tt>[]</tt>.
  module Workflow

    # Runs, in order, every method that defines one of the default tasks
    # associated with each dataset, and returns the result of the last one.
    def create_default_tasks
      [
        :create_directories_task,
        :create_symlinks_task,
        :create_initialize_task,
        :create_delete_data_task,
        :create_destroy_task,
        :create_workflow_tasks
      ].map { |task_maker| send(task_maker) }.last
    end

    # Defines the chain of plain <tt>Rake::Task</tt>s <tt>:rip</tt>,
    # <tt>:parse</tt>, <tt>:munge</tt>, <tt>:fix</tt> and <tt>:package</tt>,
    # each depending on the one before it (<tt>:rip</tt> has no
    # prerequisites), then attaches their comments.
    #
    # No task body is supplied for any of them.
    def set_default_tasks
      [
        [:rip,     []],
        [:parse,   :rip],
        [:munge,   :parse],
        [:fix,     :munge],
        [:package, :fix]
      ].each do |task_name, prerequisite|
        define_task(Rake::Task, {task_name => prerequisite})
      end
      comment_default_tasks
    end

    # Sets the initial comment on each of the default tasks and returns the
    # last comment assigned.
    def comment_default_tasks
      [
        [:rip,     "Rip dataset from an origin"],
        [:parse,   "Parse dataset into intermediate form"],
        [:munge,   "Munge dataset's structure into desired form"],
        [:fix,     "Fix and format dataset"],
        [:package, "Package dataset into a final format"]
      ].map { |task_name, text| self[task_name].comment = text }.last
    end

    # Creates the task dependency chain <tt>:package => :fix => :munge
    # => :peel => :rip => :initialize</tt> out of <tt>IMW::Task</tt>s.
    #
    # <tt>@last_description</tt> is set immediately before each task is
    # defined. Returns the result of defining <tt>:package</tt>.
    def create_workflow_tasks
      [
        [:rip,     :initialize, "Obtain data from some source."],
        [:peel,    :rip,        "Extract datafiles from ripped data."],
        [:munge,   :peel,       "Transform records in a dataset."],
        [:fix,     :munge,      "Reconcile records."],
        [:package, :fix,        "Package dataset in final form."]
      ].map do |task_name, prerequisite, description|
        @last_description = description
        define_task(IMW::Task, task_name => [prerequisite])
      end.last
    end

  end
end
|
|
80
|
+
|
|
81
|
+
# puts "#{File.basename(__FILE__)}: You find your flow next to a tall tree. Ahhhh."
|
data/lib/imw/files.rb
ADDED
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
#
|
|
2
|
+
# h2. lib/imw/files.rb -- uniform interface to various files
|
|
3
|
+
#
|
|
4
|
+
# == About
|
|
5
|
+
#
|
|
6
|
+
# Implements <tt>IMW.open</tt> which returns an appropriate +IMW+
|
|
7
|
+
# object given a URI.
|
|
8
|
+
#
|
|
9
|
+
# Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
|
|
10
|
+
# Copyright:: Copyright (c) 2008 infochimps.org
|
|
11
|
+
# License:: GPL 3.0
|
|
12
|
+
# Website:: http://infinitemonkeywrench.org/
|
|
13
|
+
#
|
|
14
|
+
# puts "#{File.basename(__FILE__)}: Something clever" # at bottom
|
|
15
|
+
|
|
16
|
+
require 'uri'
|
|
17
|
+
require 'open-uri'
|
|
18
|
+
require 'imw/utils'
|
|
19
|
+
require 'imw/files/basicfile'
|
|
20
|
+
require 'imw/files/archive'
|
|
21
|
+
require 'imw/files/compressible'
|
|
22
|
+
require 'imw/files/compressed_file'
|
|
23
|
+
|
|
24
|
+
module IMW

  # Parse +path+ and return an appropriate handler. Pass in <tt>:write
  # => true</tt> to open for writing.
  #
  #   IMW.open("/tmp/test.csv") # => IMW::Files::Csv("/tmp/test.csv")
  #
  # Pass <tt>:as => SomeClass</tt> (or a name resolvable inside IMW::Files)
  # to bypass extension matching. The <tt>:as</tt> key is not forwarded to
  # the handler's constructor, and the caller's +options+ hash is left
  # untouched.
  #
  def self.open path, options = {}
    # file_class_for removes :as from the hash it is given; work on a copy so
    # the caller can reuse the same options for another call.
    options = options.dup
    mode    = options[:write] ? 'w' : 'r'
    Files.file_class_for(path, options).new(path, mode, options)
  end

  # Same as +open+, but defaults to <tt>:write => true</tt> unless the caller
  # supplies a <tt>:write</tt> option.
  def self.open! path, options = {}
    self.open path, { :write => true }.merge(options)
  end

  module Files


    # There is certainly a cleaner way to do this.
    #
    # NOTE(review): the handler table below names :Tsv, which has no autoload
    # entry here -- confirm it is loaded by some other means.
    autoload :Text,   'imw/files/text'
    autoload :Binary, 'imw/files/binary'
    autoload :Yaml,   'imw/files/yaml'
    autoload :Csv,    'imw/files/csv'
    autoload :Json,   'imw/files/json'
    autoload :Bz2,    'imw/files/compressed_files_and_archives'
    autoload :Gz,     'imw/files/compressed_files_and_archives'
    autoload :Tar,    'imw/files/compressed_files_and_archives'
    autoload :TarBz2, 'imw/files/compressed_files_and_archives'
    autoload :TarGz,  'imw/files/compressed_files_and_archives'
    autoload :Rar,    'imw/files/compressed_files_and_archives'
    autoload :Zip,    'imw/files/compressed_files_and_archives'
    autoload :Xml,    'imw/files/sgml'
    autoload :Html,   'imw/files/sgml'


    # An array used to match files to classes to handle them. The
    # first element of each array is the regexp and the second names
    # the class to handle the file.
    #
    #   IMW::Files::EXTENSION_HANDLERS << [ /\.csv$/, :Csv ] #=> IMW::Files::Csv
    #   IMW::Files::EXTENSION_HANDLERS << [ /\.txt$/, "Text" ] #=> IMW::Files::Text
    #   IMW::Files::EXTENSION_HANDLERS << [ /\.myclass$/, MyClass ] #=> MyClass
    #
    # Elements at the end of the array have greater precedence which
    # allows, say, <tt>.tar.gz</tt> to be handled differently from
    # <tt>.gz</tt>.
    EXTENSION_HANDLERS = [
      [/./,           :Text], # catchall
      [/\.txt$/,      :Text],
      [/\.dat$/,      :Text],
      [/\.ascii$/,    :Text],
      [/\.yaml$/,     :Yaml],
      [/\.yml$/,      :Yaml],
      [/\.csv$/,      :Csv],
      [/\.tsv$/,      :Tsv],
      [/\.json$/,     :Json],
      [/\.bz2$/,      :Bz2],
      [/\.gz$/,       :Gz],
      [/\.tar\.bz2$/, :TarBz2],
      [/\.tbz2$/,     :TarBz2],
      [/\.tar\.gz$/,  :TarGz],
      [/\.tgz$/,      :TarGz],
      [/\.tar$/,      :Tar],
      [/\.rar$/,      :Rar],
      [/\.zip$/,      :Zip],
      [/\.xml$/,      :Xml],
      [/\.html$/,     :Html],
      [/\.htm$/,      :Html]
    ]

    # Returns the class that should handle +path+.
    #
    # options:: if it contains <tt>:as</tt>, that entry is removed from the
    #           hash and used instead of extension matching.
    #
    # A Class is returned as-is; a symbol or string is looked up as a
    # constant from IMW::Files (which triggers the autoloads above).
    #
    # Raises ArgumentError when no handler matches +path+, and NameError when
    # the handler name cannot be resolved to a constant.
    #
    # This is a singleton method and is public: IMW.open calls it from
    # outside the module.
    def self.file_class_for path, options = {}
      klass = options.delete(:as)
      unless klass
        # end of the table has greater precedence
        _regexp, klass = EXTENSION_HANDLERS.reverse_each.find { |regexp, _thing| regexp =~ path }
      end
      raise ArgumentError, "no file handler matches #{path.inspect}" if klass.nil?
      # const_get resolves a name without evaluating it as code.
      klass.is_a?(Class) ? klass : const_get(klass.to_s)
    end
  end
end
|