imw 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +15 -0
- data/CHANGELOG +0 -0
- data/LICENSE +674 -0
- data/README.rdoc +101 -0
- data/Rakefile +20 -0
- data/VERSION +1 -0
- data/etc/imwrc.rb +76 -0
- data/lib/imw.rb +42 -0
- data/lib/imw/boot.rb +58 -0
- data/lib/imw/dataset.rb +233 -0
- data/lib/imw/dataset/datamapper.rb +66 -0
- data/lib/imw/dataset/datamapper/time_and_user_stamps.rb +37 -0
- data/lib/imw/dataset/loaddump.rb +50 -0
- data/lib/imw/dataset/old/file_collection.rb +88 -0
- data/lib/imw/dataset/old/file_collection_utils.rb +71 -0
- data/lib/imw/dataset/scaffold.rb +132 -0
- data/lib/imw/dataset/scraped_uri.rb +305 -0
- data/lib/imw/dataset/scrub/old_working_scrubber.rb +87 -0
- data/lib/imw/dataset/scrub/scrub.rb +147 -0
- data/lib/imw/dataset/scrub/scrub_simple_url.rb +38 -0
- data/lib/imw/dataset/scrub/scrub_test.rb +60 -0
- data/lib/imw/dataset/scrub/slug.rb +101 -0
- data/lib/imw/dataset/stats.rb +73 -0
- data/lib/imw/dataset/stats/counter.rb +23 -0
- data/lib/imw/dataset/task.rb +38 -0
- data/lib/imw/dataset/workflow.rb +81 -0
- data/lib/imw/files.rb +110 -0
- data/lib/imw/files/archive.rb +113 -0
- data/lib/imw/files/basicfile.rb +122 -0
- data/lib/imw/files/binary.rb +28 -0
- data/lib/imw/files/compressed_file.rb +93 -0
- data/lib/imw/files/compressed_files_and_archives.rb +348 -0
- data/lib/imw/files/compressible.rb +103 -0
- data/lib/imw/files/csv.rb +112 -0
- data/lib/imw/files/json.rb +41 -0
- data/lib/imw/files/sgml.rb +65 -0
- data/lib/imw/files/text.rb +68 -0
- data/lib/imw/files/yaml.rb +46 -0
- data/lib/imw/packagers.rb +8 -0
- data/lib/imw/packagers/archiver.rb +108 -0
- data/lib/imw/packagers/s3_mover.rb +28 -0
- data/lib/imw/parsers.rb +7 -0
- data/lib/imw/parsers/html_parser.rb +382 -0
- data/lib/imw/parsers/html_parser/matchers.rb +306 -0
- data/lib/imw/parsers/line_parser.rb +87 -0
- data/lib/imw/parsers/regexp_parser.rb +72 -0
- data/lib/imw/utils.rb +24 -0
- data/lib/imw/utils/components.rb +61 -0
- data/lib/imw/utils/config.rb +46 -0
- data/lib/imw/utils/error.rb +54 -0
- data/lib/imw/utils/extensions/array.rb +125 -0
- data/lib/imw/utils/extensions/class/attribute_accessors.rb +8 -0
- data/lib/imw/utils/extensions/core.rb +43 -0
- data/lib/imw/utils/extensions/dir.rb +24 -0
- data/lib/imw/utils/extensions/file_core.rb +64 -0
- data/lib/imw/utils/extensions/hash.rb +218 -0
- data/lib/imw/utils/extensions/hpricot.rb +48 -0
- data/lib/imw/utils/extensions/string.rb +49 -0
- data/lib/imw/utils/extensions/struct.rb +42 -0
- data/lib/imw/utils/extensions/symbol.rb +28 -0
- data/lib/imw/utils/extensions/typed_struct.rb +22 -0
- data/lib/imw/utils/extensions/uri.rb +59 -0
- data/lib/imw/utils/log.rb +67 -0
- data/lib/imw/utils/misc.rb +63 -0
- data/lib/imw/utils/paths.rb +115 -0
- data/lib/imw/utils/uri.rb +59 -0
- data/lib/imw/utils/uuid.rb +33 -0
- data/lib/imw/utils/validate.rb +38 -0
- data/lib/imw/utils/version.rb +12 -0
- data/lib/imw/utils/view.rb +113 -0
- data/lib/imw/utils/view/dump_csv.rb +112 -0
- data/lib/imw/utils/view/dump_csv_older.rb +117 -0
- data/spec/data/sample.csv +131 -0
- data/spec/data/sample.tsv +131 -0
- data/spec/data/sample.txt +131 -0
- data/spec/data/sample.xml +653 -0
- data/spec/data/sample.yaml +652 -0
- data/spec/imw/dataset/datamapper/uri_spec.rb +43 -0
- data/spec/imw/dataset/datamapper_spec_helper.rb +11 -0
- data/spec/imw/files/archive_spec.rb +118 -0
- data/spec/imw/files/basicfile_spec.rb +121 -0
- data/spec/imw/files/bz2_spec.rb +32 -0
- data/spec/imw/files/compressed_file_spec.rb +96 -0
- data/spec/imw/files/compressible_spec.rb +100 -0
- data/spec/imw/files/file_spec.rb +144 -0
- data/spec/imw/files/gz_spec.rb +32 -0
- data/spec/imw/files/rar_spec.rb +33 -0
- data/spec/imw/files/tar_spec.rb +31 -0
- data/spec/imw/files/text_spec.rb +23 -0
- data/spec/imw/files/zip_spec.rb +31 -0
- data/spec/imw/files_spec.rb +38 -0
- data/spec/imw/packagers/archiver_spec.rb +125 -0
- data/spec/imw/packagers/s3_mover_spec.rb +7 -0
- data/spec/imw/parsers/line_parser_spec.rb +96 -0
- data/spec/imw/parsers/regexp_parser_spec.rb +42 -0
- data/spec/imw/utils/extensions/file_core_spec.rb +72 -0
- data/spec/imw/utils/extensions/find_spec.rb +113 -0
- data/spec/imw/utils/paths_spec.rb +38 -0
- data/spec/imw/workflow/rip/local_spec.rb +89 -0
- data/spec/imw/workflow/rip_spec.rb +27 -0
- data/spec/rcov.opts +1 -0
- data/spec/spec.opts +4 -0
- data/spec/spec_helper.rb +32 -0
- data/spec/support/archive_contents_matcher.rb +94 -0
- data/spec/support/custom_matchers.rb +21 -0
- data/spec/support/directory_contents_matcher.rb +61 -0
- data/spec/support/extensions.rb +18 -0
- data/spec/support/file_contents_matcher.rb +50 -0
- data/spec/support/random.rb +210 -0
- data/spec/support/without_regard_to_order_matcher.rb +58 -0
- metadata +196 -0
@@ -0,0 +1,38 @@
|
|
1
|
+
|
2
|
+
|
3
|
+
module IMW
  # Mixin that exposes a +scrubbed+ form of the including object.
  module URIScrubber
    # Returns the scrubbed representation of the receiver, which is simply
    # whatever the receiver's own +to_dirpath+ returns.
    def scrubbed
      to_dirpath
    end
  end
end
|
11
|
+
|
12
|
+
module Scrub
  #
  # Scrubber for "simplified URL" strings; the target form is spelled out in
  # the complaint message below.
  #
  # NOTE(review): the original header here read "start with a letter, and
  # contain only A-Za-z0-9_", which does not obviously describe URLs --
  # confirm whether that rule is meant to apply.
  #
  class SimplifiedURL < Scrub::Generic
    self.complaint = "should follow our zany simplified URL rules: com.host.dot-reversed:schemeifnothttp/path/seg_men-ts/stuff.ext-SHA1ifweird"
    self.validator = %r{#{Addressable::URI::SAFE_CHARS}#{Addressable::URI::RESERVED_CHARS}}u
    self.replacer  = ''
    include Scrub::Lowercased
    attr_accessor :uri

    # A string is valid when its lowercased form is already equal to its
    # sanitized form.
    def valid?(str)
      lowered = str.to_s.downcase
      lowered == sanitize(str)
    end

    # Parses +str+ heuristically as a URI and normalizes it, then returns the
    # URI's +scrubbed+ form when its host is valid and its +uuid_path+
    # otherwise.
    def sanitize(str)
      # if this fails just normalize once, or don't set $KCODE: http://bit.ly/1664vp
      parsed = Addressable::URI.heuristic_parse(str.to_s).normalize
      # print [parsed.host, parsed.host_valid?, parsed.path, parsed.path_valid?].inspect
      return parsed.scrubbed if parsed.host_valid?
      parsed.uuid_path
    end
  end
end
|
@@ -0,0 +1,60 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
require 'scrub'
|
4
|
+
require 'scrub_simple_url'
|
5
|
+
|
6
|
+
# Inputs fed to every scrubber below: empty-ish values, plain identifiers,
# strings with awkward characters, and a range of URL / path shapes.
test_strings = [
  nil, '', '12', '123', 'simple', 'UPPER', 'CamelCased', 'iden_tifier_23_',
  'twentyfouralphacharslong', 'twentyfiveatozonlyletters', 'hello.-_there@funnychar.com',
  "tab\t", "newline\n",
  "Iñtërnâtiônàlizætiøn",
  'semicolon;', 'quote"', 'tick\'', 'backtick`', 'percent%', 'plus+', 'space ',
  'leftanglebracket<', 'ampersand&',
  "control char-bel\x07",
  "http://foo.bar.com/",
  "HTTP://FOO.BAR.com",
  ".com/zazz",
  "scheme://user_name@user_acct:passwd@host-name.museum:9047/path;pathquery/p!a-th~2/path?query=param&query=pa%20ram#fragment",
  "http://web.site.com/path/path/file.ext",
  "ftp://ftp.site.com/path/path/file.ext",
  "/absolute/pathname/file.ext",
  "http://foo.bar.com/.hidden_file_with.ext",
  "http://foo.bar.com/.hidden_file",
  "dir/--/non_alpha_path_segment.ext",
  "http://foo.bar.com/dir/../two_dots_in_path",
]

# Scrubbers exercised by this script; the commented-out entries are disabled.
scrubbers = {
  # :unicode_title => Scrub::UnicodeTitle.new,
  # :title => Scrub::Title.new,
  # :identifier => Scrub::Identifier.new,
  # :free_text => Scrub::FreeText.new,
  :handle => Scrub::Handle.new,
  :simplified_url => Scrub::SimplifiedURL.new,
  # :domain => Scrub::Domain.new,
  # :email => Scrub::Email.new,
}

# For each scrubber print one row per input -- validity flag, sanitized form,
# original -- with the rows judged invalid listed before the valid ones.
scrubbers.each do |label, scrubber|
  puts label
  rows = test_strings.map do |input|
    is_valid = !!scrubber.valid?(input)
    cleaned  = scrubber.sanitize(input).inspect
    [is_valid, cleaned, input.inspect]
  end
  ordered = rows.sort_by { |is_valid, _cleaned, _input| is_valid ? 1 : -1 }
  ordered.each do |row|
    puts " %-5s %-30s %-30s" % row
  end
end
|
49
|
+
|
50
|
+
|
51
|
+
|
52
|
+
# 'foo@bar.com', 'foo@newskool-tld.museum', 'foo@twoletter-tld.de', 'foo@nonexistant-tld.qq',
|
53
|
+
# 'r@a.wk', '1234567890-234567890-234567890-234567890-234567890-234567890-234567890-234567890-234567890@gmail.com',
|
54
|
+
# 'hello.-_there@funnychar.com', 'uucp%addr@gmail.com', 'hello+routing-str@gmail.com',
|
55
|
+
# 'domain@can.haz.many.sub.doma.in',],
|
56
|
+
# :invalid => [nil, '', '!!@nobadchars.com', 'foo@no-rep-dots..com', 'foo@badtld.xxx', 'foo@toolongtld.abcdefg',
|
57
|
+
# 'Iñtërnâtiônàlizætiøn@hasnt.happened.to.email', 'need.domain.and.tld@de', "tab\t", "newline\n",
|
58
|
+
# 'r@.wk', '1234567890-234567890-234567890-234567890-234567890-234567890-234567890-234567890-234567890@gmail2.com',
|
59
|
+
# # these are technically allowed but not seen in practice:
|
60
|
+
# 'uucp!addr@gmail.com', 'semicolon;@gmail.com', 'quote"@gmail.com', 'tick\'@gmail.com', 'backtick`@gmail.com', 'space @gmail.com', 'bracket<@gmail.com', 'bracket>@gmail.com'
|
@@ -0,0 +1,101 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'addressable/uri'
|
3
|
+
require 'uuidtools'
|
4
|
+
require 'scrub'
|
5
|
+
require 'scrub_simple_url'
|
6
|
+
|
7
|
+
module IMW
|
8
|
+
|
9
|
+
#
|
10
|
+
#
|
11
|
+
# +handle+ -- reasonable effort at a uniq-ish, but human-comprehensible string
|
12
|
+
# Handle should only contain the characters A-Za-z0-9_-./
|
13
|
+
#
|
14
|
+
#
|
15
|
+
# Pairs a human-readable +handle+ with a +uniqish+ string that is meant to
# identify the same thing uniquely.
class Slug
  # A humane representation of the handle ('that-one-time-at_foo')
  attr_reader :handle
  # The purportedly unique string
  attr_accessor :uniqish

  # Both the (sanitized) handle and the uniqish start out as +handle+.
  def initialize handle
    self.handle  = handle
    self.uniqish = handle
  end

  #
  # Unless overridden, use the uniqish to
  # make a name-based UUID within the infochimps.org
  # namespace
  #
  # The original body called +full_handle+, which this class does not define.
  # It is still used when something else provides it; otherwise the uniqish is
  # used, as the comment above describes, instead of raising NoMethodError.
  def uuid
    name = respond_to?(:full_handle, true) ? full_handle : uniqish.to_s
    UUID.sha1_create(UUID_URL_NAMESPACE, name)
  end

  # Handle reduced to word characters and '-': every run of characters other
  # than \w, '/' and ':' becomes '-', each '_' is doubled, and every run of
  # '/' or ':' then becomes a single '_'. Returns '' when there is no handle.
  def url_sane
    return '' unless handle
    handle.gsub(/[^\w\/\:]+/, '-').gsub(/_/, '__').gsub(%r{[/:]+}, '_')
  end

  # Stores the sanitized form of +t+ (see +sanitize_handle+).
  def handle= t
    @handle = self.class.sanitize_handle(t)
  end

  # Strip all but handle-safe characters: each run of characters outside
  # A-Za-z0-9_-./ is replaced by +turd+.
  #
  # Returns nil for a nil +t+ (so that +url_sane+'s missing-handle guard can
  # apply) rather than raising NoMethodError; any other value is converted
  # with +to_s+ first.
  def self.sanitize_handle t, turd='-'
    return nil if t.nil?
    t.to_s.gsub(%r{[^\w\-\./]+}, turd)
  end
end
|
50
|
+
|
51
|
+
#
|
52
|
+
# Uses a URL (that's locator, not URI) as a
|
53
|
+
# presumed-uniq identifier.
|
54
|
+
#
|
55
|
+
# +uniqish+ returns the full normalized URL
|
56
|
+
#
|
57
|
+
# +handle+ is formed from the dot-reversed host, the scheme (if not http) and a
|
58
|
+
# sanitized version of the path. (The query string, fragment, etc are stripped
|
59
|
+
# from the handle)
|
60
|
+
#
|
61
|
+
#
|
62
|
+
# Slug whose uniqish is a full normalized URL and whose handle is derived from
# that URL.
class URLSlug < Slug
  # The parsed, normalized URL.
  attr_accessor :url

  # Parses +url_str+ heuristically and normalizes it, then sets +uniqish+ to
  # the normalized URL string and +handle+ to the result of +munge_url+.
  #
  # Raises RuntimeError ("Bad URL ...") when nothing could be parsed or the
  # parsed URL has no host. The nil check matters because heuristic_parse
  # hands back nil for nil input, which previously surfaced as a
  # NoMethodError on +normalize+ instead of the intended error.
  #
  # NOTE(review): +munge_url+ is not defined in this file -- confirm it is
  # provided elsewhere.
  def initialize url_str
    parsed = Addressable::URI.heuristic_parse(url_str)
    raise "Bad URL #{url_str}" if parsed.nil?
    self.url = parsed.normalize
    raise "Bad URL #{url}" unless url.host
    self.uniqish = url.to_s
    self.handle  = munge_url
  end

  # +uuid+ is inherited from Slug; the override that used to live here was a
  # verbatim copy of the parent's method.
end
|
75
|
+
end
|
76
|
+
|
77
|
+
|
78
|
+
|
79
|
+
# Mixin that fills in +handle+ (and possibly +name+) from a slug before save.
#
# Including it registers +create_slug+ through the including class's
# <tt>before :save</tt> hook and adds a class-level +slug_on+ setting.
module Sluggable
  protected

  # Builds a slug and uses it to fill in missing attributes.
  #
  # When the class is configured with <tt>slug_on :url</tt>, or the record's
  # name is blank, the slug is an IMW::URLSlug built from +url+ and +name+ is
  # overwritten with its handle. Otherwise the slug is an IMW::Slug built from
  # +name+. Either way +handle+ is set from the slug only if not already set.
  #
  # (A stray string expression, "Slugging #{self.attributes}", used to open
  # this method; it produced no output but still forced an +attributes+ call,
  # so it has been dropped.)
  def create_slug
    if (self.class.slug_on == :url) || (self.name.blank?)
      slug = IMW::URLSlug.new(self.url)
      self.name = slug.handle
    else
      slug = IMW::Slug.new(self.name)
    end
    self.handle ||= slug.handle
  end

  public

  # Hooks +create_slug+ into +base+'s before-save chain and defines
  # <tt>base.slug_on</tt>, which remembers the first non-nil value it is given
  # and returns the remembered value on every call.
  def self.included base
    base.before :save, :create_slug
    base.class_eval do
      def self.slug_on s=nil
        @slug_on ||= s
      end
    end
  end
end
|
@@ -0,0 +1,73 @@
|
|
1
|
+
#
|
2
|
+
# h2. lib/imw/dataset/stats.rb -- statistics for datasets
|
3
|
+
#
|
4
|
+
# == About
|
5
|
+
#
|
6
|
+
# Implements methods to calculate very basic statistical properties of
|
7
|
+
# a dataset.
|
8
|
+
#
|
9
|
+
# Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
|
10
|
+
# Copyright:: Copyright (c) 2008 infochimps.org
|
11
|
+
# License:: GPL 3.0
|
12
|
+
# Website:: http://infinitemonkeywrench.org/
|
13
|
+
#
|
14
|
+
# puts "#{File.basename(__FILE__)}: Something clever" # at bottom
|
15
|
+
|
16
|
+
|
17
|
+
module IMW
|
18
|
+
class Dataset
  #
  # simple histogram
  #
  # Runs down one column/attribute of a dataset (see +slice+) and returns a
  # new instance of the receiver's class built from [count, element] pairs,
  # one pair per distinct element.
  #
  def hist slicer
    counts = Hash.new(0)
    slice(slicer).each { |el| counts[el] += 1 }
    self.class.new(counts.map { |el, ct| [ct, el] })
  end

  # Maps every row to one value: the result of <tt>slicer.call(row)</tt> when
  # +slicer+ responds to +call+, otherwise <tt>row[slicer]</tt>.
  def slice slicer
    if slicer.respond_to?(:call)
      map { |row| slicer.call(row) }
    else
      map { |row| row[slicer] }
    end
  end

  #
  # Report
  #
  # Prints the histogram of the slice (unless <tt>:do_hist</tt> is false)
  # followed by the size summary. Missing options are filled in on the
  # passed-in +opts+ hash itself via +reverse_merge!+.
  #
  # NOTE(review): +data+ is not defined in this file -- presumably an
  # accessor defined elsewhere on Dataset; confirm.
  #
  def report slicer, opts={}
    opts.reverse_merge! :n_top => 20, :hist_args => [], :fmt => "%7d\t%s", :do_hist => true
    counts = hist(slicer)
    report_hist  data, counts, slicer, opts if opts[:do_hist]
    report_sizes data, counts, slicer, opts
  end

  # Prints the number of unique elements, total elements and nil elements,
  # then the min and max of the non-nil unique values.
  #
  # +counts+ holds [count, element] pairs as produced by +hist+. The nil
  # tally is the count attached to the nil element; the pairs themselves are
  # never nil, so testing them with +nil?+ always reported 0. <tt>:fmt</tt>
  # falls back to the same default +report+ uses, so the method's own default
  # +opts+ no longer crashes.
  def report_sizes data, counts, slicer, opts={}
    fmt = opts[:fmt] || "%7d\t%s"
    nil_total = counts.map { |ct, el| el.nil? ? ct : 0 }.inject(0) { |sum, ct| sum + ct }
    puts fmt % [counts.length, "unique elements"]
    puts fmt % [data.length,   "total elements"]
    puts fmt % [nil_total,     "nil elements"]
    uniqvals = counts.map{|ct,el| el}.reject(&:nil?)
    puts " min:\t#{uniqvals.min}"
    puts " max:\t#{uniqvals.max}"
  end

  # Most popular
  #
  # Prints up to <tt>opts[:n_top]</tt> of the highest-count pairs, in
  # ascending order of count. +last+ is used because slicing with
  # <tt>[-n..-1]</tt> returns nil when there are fewer than n pairs.
  # <tt>:n_top</tt> and <tt>:fmt</tt> fall back to the defaults +report+ uses.
  def report_hist data, counts, slicer, opts={}
    n_top = opts[:n_top] || 20
    fmt   = opts[:fmt]   || "%7d\t%s"
    top = counts.sort_by{|ct,el| ct}.last(n_top)
    puts "Top #{n_top} elements for slice through #{slicer}:"
    puts " -freq-\t-element-"
    puts top.map{ |ct,el| fmt % [ct,el] }
    puts "-------\t-------"
  end

end
|
73
|
+
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
module IMW
  # A Hash that maps each recorded value to the number of times it has been
  # recorded.
  class RecordCounter < Hash
    # Bumps the tally for +val+ (starting from zero) and returns the new tally.
    def record(val)
      self[val] ||= 0
      self[val] = self[val] + 1
    end

    # Yields when +val+ already has a tally, then records +val+ either way.
    # Returns the new tally.
    def if_seen(val, &block)
      yield if self[val]
      record(val)
    end

    # Yields when +val+ has no tally yet, then records +val+ either way.
    # Returns the new tally.
    def unless_seen(val, &block)
      yield unless self[val]
      record(val)
    end
  end
end
|
@@ -0,0 +1,38 @@
|
|
1
|
+
#
|
2
|
+
# h2. lib/imw/workflow/task.rb --
|
3
|
+
#
|
4
|
+
# == About
|
5
|
+
#
|
6
|
+
# This file defines a class <tt>IMW::Task</tt> which subclasses
|
7
|
+
# <tt>Rake::Task</tt>. Tasks defined in IMW should be instances of
|
8
|
+
# <tt>IMW::Task</tt>.
|
9
|
+
#
|
10
|
+
# Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
|
11
|
+
# Copyright:: Copyright (c) 2008 infochimps.org
|
12
|
+
# License:: GPL 3.0
|
13
|
+
# Website:: http://infinitemonkeywrench.org/
|
14
|
+
#
|
15
|
+
# puts "#{File.basename(__FILE__)}: Something clever" # at bottom
|
16
|
+
|
17
|
+
require 'rake'
|
18
|
+
|
19
|
+
module IMW
  # Task class used for tasks defined through IMW; adds nothing to
  # <tt>Rake::Task</tt> here.
  class Task < Rake::Task; end

  class Dataset
    include Rake::TaskManager

    # Defines (or fetches, if it already exists) the <tt>IMW::Task</tt> called
    # +name+ on this dataset's task manager. +name+ and the optional block are
    # handed straight to +define_task+, so dependencies can be given in the
    # usual Rake form (<tt>:name => [:deps]</tt>).
    def task(name, &block)
      define_task(IMW::Task, name, &block)
    end
  end
end
|
36
|
+
|
37
|
+
|
38
|
+
|
@@ -0,0 +1,81 @@
|
|
1
|
+
#
|
2
|
+
# lib/imw/workflow.rb -- implements the workflow class
|
3
|
+
#
|
4
|
+
# == About
|
5
|
+
#
|
6
|
+
# This file implements the <tt>IMW::Workflow</tt> class which tailors
|
7
|
+
# the functionality of Rake for IMW objects.
|
8
|
+
#
|
9
|
+
# Author:: Philip flip Kromer for infochimps.org (mailto:coders@infochimps.org)
|
10
|
+
# Copyright:: Copyright (c) 2008 infochimps.org
|
11
|
+
# License:: GPL 3.0
|
12
|
+
# Website:: http://infinitemonkeywrench.org/
|
13
|
+
#
|
14
|
+
|
15
|
+
require 'imw/dataset/scaffold'
|
16
|
+
require 'imw/dataset/task'
|
17
|
+
|
18
|
+
module IMW

  # Methods that define Rake[http://rake.rubyforge.org/] tasks for a dataset.
  # They rely on the including object providing +define_task+ and +[]+.
  module Workflow

    # Runs, in order, each of the methods that set up a dataset's default
    # tasks.
    def create_default_tasks
      [
        :create_directories_task,
        :create_symlinks_task,
        :create_initialize_task,
        :create_delete_data_task,
        :create_destroy_task,
        :create_workflow_tasks
      ].each { |builder| __send__(builder) }
    end

    # Defines the plain <tt>Rake::Task</tt> chain <tt>:rip</tt>,
    # <tt>:parse</tt>, <tt>:munge</tt>, <tt>:fix</tt>, <tt>:package</tt>, each
    # depending on the one before it (<tt>:rip</tt> has no prerequisites),
    # and then attaches their comments.
    def set_default_tasks
      chain = [
        {:rip     => []},
        {:parse   => :rip},
        {:munge   => :parse},
        {:fix     => :munge},
        {:package => :fix}
      ]
      chain.each { |spec| define_task(Rake::Task, spec) }
      comment_default_tasks
    end

    # Set the initial comments for each of the default tasks.
    def comment_default_tasks
      [
        [:rip,     "Rip dataset from an origin"],
        [:parse,   "Parse dataset into intermediate form"],
        [:munge,   "Munge dataset's structure into desired form"],
        [:fix,     "Fix and format dataset"],
        [:package, "Package dataset into a final format"]
      ].each do |task_name, text|
        self[task_name].comment = text
      end
    end

    # Creates the <tt>IMW::Task</tt> dependency chain <tt>:package => :fix =>
    # :munge => :peel => :rip => :initialize</tt>, setting
    # <tt>@last_description</tt> immediately before each task is defined.
    def create_workflow_tasks
      steps = [
        ["Obtain data from some source.",       {:rip     => [:initialize]}],
        ["Extract datafiles from ripped data.", {:peel    => [:rip]}],
        ["Transform records in a dataset.",     {:munge   => [:peel]}],
        ["Reconcile records.",                  {:fix     => [:munge]}],
        ["Package dataset in final form.",      {:package => [:fix]}]
      ]
      steps.each do |description, spec|
        @last_description = description
        define_task(IMW::Task, spec)
      end
    end

  end
end
|
80
|
+
|
81
|
+
# puts "#{File.basename(__FILE__)}: You find your flow next to a tall tree. Ahhhh."
|
data/lib/imw/files.rb
ADDED
@@ -0,0 +1,110 @@
|
|
1
|
+
#
|
2
|
+
# h2. lib/imw/files.rb -- uniform interface to various files
|
3
|
+
#
|
4
|
+
# == About
|
5
|
+
#
|
6
|
+
# Implements <tt>IMW.open</tt> which returns an appropriate +IMW+
|
7
|
+
# object given a URI.
|
8
|
+
#
|
9
|
+
# Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
|
10
|
+
# Copyright:: Copyright (c) 2008 infochimps.org
|
11
|
+
# License:: GPL 3.0
|
12
|
+
# Website:: http://infinitemonkeywrench.org/
|
13
|
+
#
|
14
|
+
# puts "#{File.basename(__FILE__)}: Something clever" # at bottom
|
15
|
+
|
16
|
+
require 'uri'
|
17
|
+
require 'open-uri'
|
18
|
+
require 'imw/utils'
|
19
|
+
require 'imw/files/basicfile'
|
20
|
+
require 'imw/files/archive'
|
21
|
+
require 'imw/files/compressible'
|
22
|
+
require 'imw/files/compressed_file'
|
23
|
+
|
24
|
+
module IMW

  # Parse +path+ and return an appropriate handler. Pass in <tt>:write
  # => true</tt> to open for writing, and <tt>:as => SomeClass</tt> to pick
  # the handler class yourself instead of matching on the extension.
  #
  #   IMW.open("/tmp/test.csv") # => IMW::Files::Csv("/tmp/test.csv")
  #
  # The handler is built as <tt>klass.new(path, mode, options)</tt> with a
  # mode of 'w' or 'r'. Note that <tt>:as</tt> is removed from +options+
  # before the handler sees them.
  def self.open path, options = {}
    mode = options[:write] ? 'w' : 'r'
    Files.file_class_for(path, options).new(path, mode, options)
  end

  # Like +open+, but <tt>:write</tt> defaults to true. An explicit
  # <tt>:write</tt> given by the caller still wins.
  def self.open! path, options = {}
    self.open path, { :write => true }.merge(options)
  end

  module Files

    # There is certainly a cleaner way to do this.
    autoload :Text,   'imw/files/text'
    autoload :Binary, 'imw/files/binary'
    autoload :Yaml,   'imw/files/yaml'
    autoload :Csv,    'imw/files/csv'
    # :Tsv is named in EXTENSION_HANDLERS but had no autoload, so a .tsv path
    # could only resolve once the Csv file happened to be loaded already.
    # NOTE(review): assumes Tsv is defined alongside Csv -- confirm.
    autoload :Tsv,    'imw/files/csv'
    autoload :Json,   'imw/files/json'
    autoload :Bz2,    'imw/files/compressed_files_and_archives'
    autoload :Gz,     'imw/files/compressed_files_and_archives'
    autoload :Tar,    'imw/files/compressed_files_and_archives'
    autoload :TarBz2, 'imw/files/compressed_files_and_archives'
    autoload :TarGz,  'imw/files/compressed_files_and_archives'
    autoload :Rar,    'imw/files/compressed_files_and_archives'
    autoload :Zip,    'imw/files/compressed_files_and_archives'
    autoload :Xml,    'imw/files/sgml'
    autoload :Html,   'imw/files/sgml'

    # An array used to match files to classes to handle them. The
    # first element of each array is the regexp and the second names
    # the class to handle the file: a Class is used as is, anything else
    # is converted with +to_s+ and resolved inside IMW::Files.
    #
    #   IMW::Files::EXTENSION_HANDLERS << [ /\.csv$/, :Csv ]        #=> IMW::Files::Csv
    #   IMW::Files::EXTENSION_HANDLERS << [ /\.txt$/, "Text" ]      #=> IMW::Files::Text
    #   IMW::Files::EXTENSION_HANDLERS << [ /\.myclass$/, MyClass ] #=> MyClass
    #
    # Elements at the end of the array have greater precedence which
    # allows, say, <tt>.tar.gz</tt> to be handled differently from
    # <tt>.gz</tt>.
    EXTENSION_HANDLERS = [
      [/./,           :Text], # catchall
      [/\.txt$/,      :Text],
      [/\.dat$/,      :Text],
      [/\.ascii$/,    :Text],
      [/\.yaml$/,     :Yaml],
      [/\.yml$/,      :Yaml],
      [/\.csv$/,      :Csv],
      [/\.tsv$/,      :Tsv],
      [/\.json$/,     :Json],
      [/\.bz2$/,      :Bz2],
      [/\.gz$/,       :Gz],
      [/\.tar\.bz2$/, :TarBz2],
      [/\.tbz2$/,     :TarBz2],
      [/\.tar\.gz$/,  :TarGz],
      [/\.tgz$/,      :TarGz],
      [/\.tar$/,      :Tar],
      [/\.rar$/,      :Rar],
      [/\.zip$/,      :Zip],
      [/\.xml$/,      :Xml],
      [/\.html$/,     :Html],
      [/\.htm$/,      :Html]
    ]

    # Returns the class that should handle +path+.
    #
    # An <tt>:as</tt> option, if present, is removed from +options+ and used
    # directly; otherwise the last EXTENSION_HANDLERS entry whose regexp
    # matches +path+ wins. +path+ is converted with +to_s+ before matching so
    # that non-String paths (e.g. a Pathname) do not raise a TypeError.
    #
    # This must stay publicly callable because IMW.open uses it; the bare
    # +protected+ that used to precede it had no effect on a +self.+ method.
    def self.file_class_for path, options = {}
      klass = options.delete(:as)
      unless klass
        path_string = path.to_s
        EXTENSION_HANDLERS.reverse_each do |regexp, thing| # end has greater precedence
          next unless regexp =~ path_string
          klass = thing
          break
        end
      end
      # NOTE(review): a non-Class handler is evaluated as Ruby source here, so
      # never let <tt>:as</tt> or handler names come from untrusted input.
      klass.is_a?(Class) ? klass : class_eval(klass.to_s)
    end
  end
end
|