imw 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (111) hide show
  1. data/.gitignore +15 -0
  2. data/CHANGELOG +0 -0
  3. data/LICENSE +674 -0
  4. data/README.rdoc +101 -0
  5. data/Rakefile +20 -0
  6. data/VERSION +1 -0
  7. data/etc/imwrc.rb +76 -0
  8. data/lib/imw.rb +42 -0
  9. data/lib/imw/boot.rb +58 -0
  10. data/lib/imw/dataset.rb +233 -0
  11. data/lib/imw/dataset/datamapper.rb +66 -0
  12. data/lib/imw/dataset/datamapper/time_and_user_stamps.rb +37 -0
  13. data/lib/imw/dataset/loaddump.rb +50 -0
  14. data/lib/imw/dataset/old/file_collection.rb +88 -0
  15. data/lib/imw/dataset/old/file_collection_utils.rb +71 -0
  16. data/lib/imw/dataset/scaffold.rb +132 -0
  17. data/lib/imw/dataset/scraped_uri.rb +305 -0
  18. data/lib/imw/dataset/scrub/old_working_scrubber.rb +87 -0
  19. data/lib/imw/dataset/scrub/scrub.rb +147 -0
  20. data/lib/imw/dataset/scrub/scrub_simple_url.rb +38 -0
  21. data/lib/imw/dataset/scrub/scrub_test.rb +60 -0
  22. data/lib/imw/dataset/scrub/slug.rb +101 -0
  23. data/lib/imw/dataset/stats.rb +73 -0
  24. data/lib/imw/dataset/stats/counter.rb +23 -0
  25. data/lib/imw/dataset/task.rb +38 -0
  26. data/lib/imw/dataset/workflow.rb +81 -0
  27. data/lib/imw/files.rb +110 -0
  28. data/lib/imw/files/archive.rb +113 -0
  29. data/lib/imw/files/basicfile.rb +122 -0
  30. data/lib/imw/files/binary.rb +28 -0
  31. data/lib/imw/files/compressed_file.rb +93 -0
  32. data/lib/imw/files/compressed_files_and_archives.rb +348 -0
  33. data/lib/imw/files/compressible.rb +103 -0
  34. data/lib/imw/files/csv.rb +112 -0
  35. data/lib/imw/files/json.rb +41 -0
  36. data/lib/imw/files/sgml.rb +65 -0
  37. data/lib/imw/files/text.rb +68 -0
  38. data/lib/imw/files/yaml.rb +46 -0
  39. data/lib/imw/packagers.rb +8 -0
  40. data/lib/imw/packagers/archiver.rb +108 -0
  41. data/lib/imw/packagers/s3_mover.rb +28 -0
  42. data/lib/imw/parsers.rb +7 -0
  43. data/lib/imw/parsers/html_parser.rb +382 -0
  44. data/lib/imw/parsers/html_parser/matchers.rb +306 -0
  45. data/lib/imw/parsers/line_parser.rb +87 -0
  46. data/lib/imw/parsers/regexp_parser.rb +72 -0
  47. data/lib/imw/utils.rb +24 -0
  48. data/lib/imw/utils/components.rb +61 -0
  49. data/lib/imw/utils/config.rb +46 -0
  50. data/lib/imw/utils/error.rb +54 -0
  51. data/lib/imw/utils/extensions/array.rb +125 -0
  52. data/lib/imw/utils/extensions/class/attribute_accessors.rb +8 -0
  53. data/lib/imw/utils/extensions/core.rb +43 -0
  54. data/lib/imw/utils/extensions/dir.rb +24 -0
  55. data/lib/imw/utils/extensions/file_core.rb +64 -0
  56. data/lib/imw/utils/extensions/hash.rb +218 -0
  57. data/lib/imw/utils/extensions/hpricot.rb +48 -0
  58. data/lib/imw/utils/extensions/string.rb +49 -0
  59. data/lib/imw/utils/extensions/struct.rb +42 -0
  60. data/lib/imw/utils/extensions/symbol.rb +28 -0
  61. data/lib/imw/utils/extensions/typed_struct.rb +22 -0
  62. data/lib/imw/utils/extensions/uri.rb +59 -0
  63. data/lib/imw/utils/log.rb +67 -0
  64. data/lib/imw/utils/misc.rb +63 -0
  65. data/lib/imw/utils/paths.rb +115 -0
  66. data/lib/imw/utils/uri.rb +59 -0
  67. data/lib/imw/utils/uuid.rb +33 -0
  68. data/lib/imw/utils/validate.rb +38 -0
  69. data/lib/imw/utils/version.rb +12 -0
  70. data/lib/imw/utils/view.rb +113 -0
  71. data/lib/imw/utils/view/dump_csv.rb +112 -0
  72. data/lib/imw/utils/view/dump_csv_older.rb +117 -0
  73. data/spec/data/sample.csv +131 -0
  74. data/spec/data/sample.tsv +131 -0
  75. data/spec/data/sample.txt +131 -0
  76. data/spec/data/sample.xml +653 -0
  77. data/spec/data/sample.yaml +652 -0
  78. data/spec/imw/dataset/datamapper/uri_spec.rb +43 -0
  79. data/spec/imw/dataset/datamapper_spec_helper.rb +11 -0
  80. data/spec/imw/files/archive_spec.rb +118 -0
  81. data/spec/imw/files/basicfile_spec.rb +121 -0
  82. data/spec/imw/files/bz2_spec.rb +32 -0
  83. data/spec/imw/files/compressed_file_spec.rb +96 -0
  84. data/spec/imw/files/compressible_spec.rb +100 -0
  85. data/spec/imw/files/file_spec.rb +144 -0
  86. data/spec/imw/files/gz_spec.rb +32 -0
  87. data/spec/imw/files/rar_spec.rb +33 -0
  88. data/spec/imw/files/tar_spec.rb +31 -0
  89. data/spec/imw/files/text_spec.rb +23 -0
  90. data/spec/imw/files/zip_spec.rb +31 -0
  91. data/spec/imw/files_spec.rb +38 -0
  92. data/spec/imw/packagers/archiver_spec.rb +125 -0
  93. data/spec/imw/packagers/s3_mover_spec.rb +7 -0
  94. data/spec/imw/parsers/line_parser_spec.rb +96 -0
  95. data/spec/imw/parsers/regexp_parser_spec.rb +42 -0
  96. data/spec/imw/utils/extensions/file_core_spec.rb +72 -0
  97. data/spec/imw/utils/extensions/find_spec.rb +113 -0
  98. data/spec/imw/utils/paths_spec.rb +38 -0
  99. data/spec/imw/workflow/rip/local_spec.rb +89 -0
  100. data/spec/imw/workflow/rip_spec.rb +27 -0
  101. data/spec/rcov.opts +1 -0
  102. data/spec/spec.opts +4 -0
  103. data/spec/spec_helper.rb +32 -0
  104. data/spec/support/archive_contents_matcher.rb +94 -0
  105. data/spec/support/custom_matchers.rb +21 -0
  106. data/spec/support/directory_contents_matcher.rb +61 -0
  107. data/spec/support/extensions.rb +18 -0
  108. data/spec/support/file_contents_matcher.rb +50 -0
  109. data/spec/support/random.rb +210 -0
  110. data/spec/support/without_regard_to_order_matcher.rb +58 -0
  111. metadata +196 -0
@@ -0,0 +1,38 @@
1
+
2
+
3
module IMW
  # Mixin that exposes a +scrubbed+ form of the including object.
  module URIScrubber
    # Returns whatever the including object's +to_dirpath+ returns.
    # The including class must provide +to_dirpath+.
    def scrubbed; to_dirpath; end
  end
end
11
+
12
module Scrub
  #
  # Scrubber that reduces a URL-ish string to a simplified form.
  #
  # +sanitize+ parses the input with Addressable, normalizes it and returns
  # the URI's +scrubbed+ form when the host is valid, or its +uuid_path+
  # otherwise. +valid?+ is true when the lowercased input already equals
  # that sanitized form.
  #
  class SimplifiedURL < Scrub::Generic
    self.complaint = "should follow our zany simplified URL rules: com.host.dot-reversed:schemeifnothttp/path/seg_men-ts/stuff.ext-SHA1ifweird"
    self.validator = %r{#{Addressable::URI::SAFE_CHARS}#{Addressable::URI::RESERVED_CHARS}}u
    self.replacer  = ''
    include Scrub::Lowercased
    attr_accessor :uri

    # True when +str+, stringified and lowercased, is already in sanitized form.
    def valid? str
      lowered = str.to_s.downcase
      lowered == sanitize(str)
    end

    # Returns the simplified form of +str+.
    def sanitize str
      # if this fails just normalize once, or don't set $KCODE: http://bit.ly/1664vp
      parsed = Addressable::URI.heuristic_parse(str.to_s).normalize
      parsed.host_valid? ? parsed.scrubbed : parsed.uuid_path
    end
  end
end
@@ -0,0 +1,60 @@
1
+ #!/usr/bin/env ruby
2
+ # -*- coding: utf-8 -*-
3
+ require 'scrub'
4
+ require 'scrub_simple_url'
5
+
6
# Inputs fed to every scrubber below: plain words, odd punctuation,
# whitespace/control characters, non-ASCII text and assorted URLs/paths.
test_strings = [
  # simple values
  nil, '', '12', '123', 'simple', 'UPPER', 'CamelCased', 'iden_tifier_23_',
  'twentyfouralphacharslong', 'twentyfiveatozonlyletters', 'hello.-_there@funnychar.com',
  # whitespace and non-ASCII
  "tab\t", "newline\n",
  "Iñtërnâtiônàlizætiøn",
  # punctuation
  'semicolon;', 'quote"', 'tick\'', 'backtick`', 'percent%', 'plus+', 'space ',
  'leftanglebracket<', 'ampersand&',
  "control char-bel\x07",
  # URLs and paths
  "http://foo.bar.com/",
  "HTTP://FOO.BAR.com",
  ".com/zazz",
  "scheme://user_name@user_acct:passwd@host-name.museum:9047/path;pathquery/p!a-th~2/path?query=param&amp;query=pa%20ram#fragment",
  "http://web.site.com/path/path/file.ext",
  "ftp://ftp.site.com/path/path/file.ext",
  "/absolute/pathname/file.ext",
  "http://foo.bar.com/.hidden_file_with.ext",
  "http://foo.bar.com/.hidden_file",
  "dir/--/non_alpha_path_segment.ext",
  "http://foo.bar.com/dir/../two_dots_in_path",
]

# Scrubbers under test, keyed by the label printed above their results.
# Commented-out entries are currently disabled.
scrubbers = {
  # :unicode_title => Scrub::UnicodeTitle.new,
  # :title => Scrub::Title.new,
  # :identifier => Scrub::Identifier.new,
  # :free_text => Scrub::FreeText.new,
  :handle         => Scrub::Handle.new,
  :simplified_url => Scrub::SimplifiedURL.new,
  # :domain => Scrub::Domain.new,
  # :email => Scrub::Email.new,
}

# For each scrubber print one row per input: validity, the inspected
# sanitized value and the inspected original, invalid inputs first.
scrubbers.each_pair do |label, scrubber|
  puts label
  rows = test_strings.map do |input|
    verdict = scrubber.valid?(input) ? true : false
    [verdict, scrubber.sanitize(input).inspect, input.inspect]
  end
  ordered = rows.sort_by { |verdict, _clean, _raw| verdict ? 1 : -1 }
  ordered.each do |verdict, clean, raw|
    puts format(" %-5s %-30s %-30s", verdict, clean, raw)
  end
end
49
+
50
+
51
+
52
+ # 'foo@bar.com', 'foo@newskool-tld.museum', 'foo@twoletter-tld.de', 'foo@nonexistant-tld.qq',
53
+ # 'r@a.wk', '1234567890-234567890-234567890-234567890-234567890-234567890-234567890-234567890-234567890@gmail.com',
54
+ # 'hello.-_there@funnychar.com', 'uucp%addr@gmail.com', 'hello+routing-str@gmail.com',
55
+ # 'domain@can.haz.many.sub.doma.in',],
56
+ # :invalid => [nil, '', '!!@nobadchars.com', 'foo@no-rep-dots..com', 'foo@badtld.xxx', 'foo@toolongtld.abcdefg',
57
+ # 'Iñtërnâtiônàlizætiøn@hasnt.happened.to.email', 'need.domain.and.tld@de', "tab\t", "newline\n",
58
+ # 'r@.wk', '1234567890-234567890-234567890-234567890-234567890-234567890-234567890-234567890-234567890@gmail2.com',
59
+ # # these are technically allowed but not seen in practice:
60
+ # 'uucp!addr@gmail.com', 'semicolon;@gmail.com', 'quote"@gmail.com', 'tick\'@gmail.com', 'backtick`@gmail.com', 'space @gmail.com', 'bracket<@gmail.com', 'bracket>@gmail.com'
@@ -0,0 +1,101 @@
1
+ require 'rubygems'
2
+ require 'addressable/uri'
3
+ require 'uuidtools'
4
+ require 'scrub'
5
+ require 'scrub_simple_url'
6
+
7
module IMW

  #
  # +handle+ -- reasonable effort at a uniq-ish, but human-comprehensible string.
  # Handle should only contain the characters A-Za-z0-9_-./
  #
  class Slug
    # A humane representation of the handle ('that-one-time-at_foo')
    attr_reader :handle
    # The purportedly unique string; the raw value the slug was built from
    attr_accessor :uniqish

    # Builds a slug from +handle+: the sanitized value becomes +handle+ and
    # the raw value becomes +uniqish+. A nil +handle+ is accepted and leaves
    # both attributes nil.
    def initialize handle
      self.handle  = handle
      self.uniqish = handle
    end

    #
    # Unless overridden, use the uniqish to make a name-based UUID within
    # the UUID_URL_NAMESPACE namespace.
    #
    def uuid
      UUID.sha1_create(UUID_URL_NAMESPACE, uuid_name)
    end

    # Handle with only \w characters and '-': runs of other characters become
    # '-', underscores are doubled, and runs of '/' or ':' become a single '_'.
    # Returns '' when there is no handle.
    def url_sane
      return '' unless handle
      handle.gsub(/[^\w\/\:]+/, '-').gsub(/_/, '__').gsub(%r{[/:]+}, '_')
    end

    # Stores the sanitized form of +t+ (see Slug.sanitize_handle).
    def handle= t
      @handle = self.class.sanitize_handle(t)
    end

    # Strip all but handle-safe characters: every run of characters outside
    # A-Za-z0-9_-./ is replaced by +turd+. Returns nil for a nil +t+ so that
    # a slug can be built without a handle (url_sane already allows for that).
    def self.sanitize_handle t, turd='-'
      return nil if t.nil?
      t.to_s.gsub(%r{[^\w\-\./]+}, turd)
    end

    private

    # Name fed to the UUID generator. The original code called +full_handle+,
    # which this file never defines; keep honouring it if something else
    # provides it, otherwise fall back to +uniqish+ as documented on #uuid.
    def uuid_name
      respond_to?(:full_handle, true) ? full_handle : uniqish
    end
  end

  #
  # Uses a URL (that's locator, not URI) as a
  # presumed-uniq identifier.
  #
  # +uniqish+ returns the full normalized URL
  #
  # +handle+ is formed from the dot-reversed host, the scheme (if not http) and a
  # sanitized version of the path. (The query string, fragment, etc are stripped
  # from the handle)
  #
  # +uuid+ is inherited unchanged from Slug.
  #
  class URLSlug < Slug
    attr_accessor :url

    # Parses and normalizes +url_str+; raises RuntimeError ("Bad URL ...")
    # when the parsed URL has no host.
    def initialize url_str
      self.url = Addressable::URI.heuristic_parse(url_str).normalize
      raise "Bad URL #{url}" unless url.host
      self.uniqish = url.to_s
      # NOTE(review): munge_url is not defined anywhere in this file --
      # confirm where it is supposed to come from.
      self.handle = munge_url
    end
  end
end
76
+
77
+
78
+
79
# Mixin for records that need a +handle+ derived from their name or URL.
#
# Including it registers +create_slug+ as a before-save hook (through the
# including class's +before+ method) and adds a class-level +slug_on+ setting.
module Sluggable
  protected

  # Fills in +handle+ (unless already set) from a slug.
  #
  # When the class is configured with <tt>slug_on :url</tt>, or the record's
  # name is blank, the slug is built from +url+ and the record's +name+ is
  # overwritten with the slug's handle; otherwise the slug is built from
  # +name+.
  def create_slug
    if (self.class.slug_on == :url) || (self.name.blank?)
      slug = IMW::URLSlug.new(self.url)
      self.name = slug.handle
    else
      slug = IMW::Slug.new(self.name)
    end
    self.handle ||= slug.handle
  end

  public

  # Hook run when the module is included in +base+.
  def self.included base
    base.before :save, :create_slug
    base.class_eval do
      # Reads or sets what to slug on. The first non-nil value given
      # sticks; later calls return it regardless of their argument.
      def self.slug_on s=nil
        @slug_on ||= s
      end
    end
  end
end
@@ -0,0 +1,73 @@
1
+ #
2
+ # h2. lib/imw/dataset/stats.rb -- statistics for datasets
3
+ #
4
+ # == About
5
+ #
6
+ # Implements methods to calculate very basic statistical properties of
7
+ # a dataset.
8
+ #
9
+ # Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
10
+ # Copyright:: Copyright (c) 2008 infochimps.org
11
+ # License:: GPL 3.0
12
+ # Website:: http://infinitemonkeywrench.org/
13
+ #
14
+ # puts "#{File.basename(__FILE__)}: Something clever" # at bottom
15
+
16
+
17
module IMW
  class Dataset
    #
    # simple histogram
    #
    # Runs down one column/attribute of a dataset (see #slice) and returns a
    # new dataset of [count, element] pairs, one per distinct element.
    #
    def hist slicer
      counts = Hash.new(0)
      slice(slicer).each { |el| counts[el] += 1 }
      self.class.new(counts.map { |el, ct| [ct, el] })
    end

    # Extracts one value per row: the result of <tt>slicer.call(row)</tt>
    # when +slicer+ is callable, otherwise <tt>row[slicer]</tt>.
    def slice slicer
      if slicer.respond_to?(:call)
        map { |row| slicer.call(row) }
      else
        map { |row| row[slicer] }
      end
    end

    #
    # Report
    #
    # Prints the most frequent values of the slice (unless
    # <tt>:do_hist => false</tt>) followed by size statistics.
    #
    # Options: <tt>:n_top</tt> (default 20), <tt>:fmt</tt> (default
    # "%7d\t%s"), <tt>:do_hist</tt> (default true), <tt>:hist_args</tt>.
    # The caller's +opts+ hash is not modified.
    #
    def report slicer, opts={}
      opts   = report_defaults.merge(opts)
      counts = hist(slicer)
      report_hist data, counts, slicer, opts if opts[:do_hist]
      report_sizes data, counts, slicer, opts
    end

    # Prints the number of unique, total and nil elements and the min/max of
    # the non-nil unique values. +counts+ holds [count, element] pairs.
    def report_sizes data, counts, slicer, opts={}
      fmt = report_defaults.merge(opts)[:fmt]
      # A nil element shows up as a pair whose second entry is nil; its count
      # is the number of nil values seen (the pairs themselves are never nil).
      nil_total = counts.inject(0) { |sum, (ct, el)| el.nil? ? sum + ct : sum }
      puts fmt % [counts.length, "unique elements"]
      puts fmt % [data.length, "total elements"]
      puts fmt % [nil_total, "nil elements"]
      uniqvals = counts.map { |ct, el| el }.reject { |el| el.nil? }
      puts " min:\t#{uniqvals.min}"
      puts " max:\t#{uniqvals.max}"
    end

    # Most popular: prints the <tt>:n_top</tt> most frequent elements in
    # ascending order of frequency. Fewer rows than <tt>:n_top</tt> is fine.
    def report_hist data, counts, slicer, opts={}
      opts = report_defaults.merge(opts)
      top  = counts.sort_by { |ct, el| ct }.last(opts[:n_top])
      puts "Top #{opts[:n_top]} elements for slice through #{slicer}:"
      puts " -freq-\t-element-"
      puts top.map { |ct, el| opts[:fmt] % [ct, el] }
      puts "-------\t-------"
    end

    private

    # Default options shared by the report methods.
    def report_defaults
      { :n_top => 20, :hist_args => [], :fmt => "%7d\t%s", :do_hist => true }
    end

  end
end
@@ -0,0 +1,23 @@
1
module IMW
  # A Hash that tallies how many times each value has been recorded.
  class RecordCounter < Hash
    # Adds one to the tally for +val+ (a missing tally counts as zero) and
    # returns the new tally.
    def record val
      self[val] = (self[val] || 0) + 1
    end

    # Yields when +val+ already has a tally, then records +val+.
    # Returns the new tally.
    def if_seen val, &block
      yield if self[val]
      record val
    end

    # Yields when +val+ has no tally yet, then records +val+.
    # Returns the new tally.
    def unless_seen val, &block
      yield unless self[val]
      record val
    end

  end
end
@@ -0,0 +1,38 @@
1
+ #
2
+ # h2. lib/imw/dataset/task.rb -- defines IMW::Task
3
+ #
4
+ # == About
5
+ #
6
+ # This file defines a class <tt>IMW::Task</tt> which subclasses
7
+ # <tt>Rake::Task</tt>. Tasks defined in IMW should be instances of
8
+ # <tt>IMW::Task</tt>.
9
+ #
10
+ # Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
11
+ # Copyright:: Copyright (c) 2008 infochimps.org
12
+ # License:: GPL 3.0
13
+ # Website:: http://infinitemonkeywrench.org/
14
+ #
15
+ # puts "#{File.basename(__FILE__)}: Something clever" # at bottom
16
+
17
+ require 'rake'
18
+
19
module IMW

  # Task class used for tasks defined in IMW; adds nothing to
  # <tt>Rake::Task</tt> itself.
  class Task < Rake::Task; end

  class Dataset
    include Rake::TaskManager

    # Return a new (or existing) <tt>IMW::Task</tt> with the given
    # +name+. +name+ and the block are handed straight to
    # <tt>define_task</tt>, so dependencies can be declared and a block
    # passed in just as in Rake.
    def task name, &block
      define_task(IMW::Task, name, &block)
    end

  end
end
36
+
37
+
38
+
@@ -0,0 +1,81 @@
1
+ #
2
+ # lib/imw/dataset/workflow.rb -- implements the IMW::Workflow module
3
+ #
4
+ # == About
5
+ #
6
+ # This file implements the <tt>IMW::Workflow</tt> class which tailors
7
+ # the functionality of Rake for IMW objects.
8
+ #
9
+ # Author:: Philip flip Kromer for infochimps.org (mailto:coders@infochimps.org)
10
+ # Copyright:: Copyright (c) 2008 infochimps.org
11
+ # License:: GPL 3.0
12
+ # Website:: http://infinitemonkeywrench.org/
13
+ #
14
+
15
+ require 'imw/dataset/scaffold'
16
+ require 'imw/dataset/task'
17
+
18
module IMW

  # The <tt>IMW::Workflow</tt> module is a collection of methods which
  # define Rake[http://rake.rubyforge.org/] tasks specialized for each
  # dataset. The including object must provide +define_task+ and +[]+.
  module Workflow

    # Defines the default tasks associated with each dataset by calling,
    # in order, each of the task-creating methods below.
    def create_default_tasks
      create_directories_task
      create_symlinks_task
      create_initialize_task
      create_delete_data_task
      create_destroy_task
      create_workflow_tasks
    end

    # Sets the default tasks in this workflow.
    #
    # The default tasks constitute a set of consecutive actions that
    # must be taken in order: <tt>:rip</tt>, <tt>:parse</tt>,
    # <tt>:munge</tt>, <tt>:fix</tt>, and <tt>:package</tt>. Each task
    # is a <tt>Rake::Task</tt> which depends on the one before it
    # (<tt>:rip</tt> has no dependencies). Comments are attached
    # afterwards by +comment_default_tasks+.
    def set_default_tasks
      chain = [
        [:rip,     []],
        [:parse,   :rip],
        [:munge,   :parse],
        [:fix,     :munge],
        [:package, :fix]
      ]
      chain.each { |name, deps| define_task(Rake::Task, { name => deps }) }
      comment_default_tasks
    end

    # Set the initial comments for each of the default tasks.
    def comment_default_tasks
      [
        [:rip,     "Rip dataset from an origin"],
        [:parse,   "Parse dataset into intermediate form"],
        [:munge,   "Munge dataset's structure into desired form"],
        [:fix,     "Fix and format dataset"],
        [:package, "Package dataset into a final format"]
      ].each do |name, text|
        self[name].comment = text
      end
    end

    # Creates the task dependency chain <tt>:package => :fix => :munge
    # => :peel => :rip => :initialize</tt>, setting
    # <tt>@last_description</tt> just before each task is defined.
    def create_workflow_tasks
      steps = [
        ["Obtain data from some source.",       { :rip     => [:initialize] }],
        ["Extract datafiles from ripped data.", { :peel    => [:rip] }],
        ["Transform records in a dataset.",     { :munge   => [:peel] }],
        ["Reconcile records.",                  { :fix     => [:munge] }],
        ["Package dataset in final form.",      { :package => [:fix] }]
      ]
      steps.each do |description, spec|
        @last_description = description
        define_task(IMW::Task, spec)
      end
    end

  end
end
80
+
81
+ # puts "#{File.basename(__FILE__)}: You find your flow next to a tall tree. Ahhhh."
@@ -0,0 +1,110 @@
1
+ #
2
+ # h2. lib/imw/files.rb -- uniform interface to various files
3
+ #
4
+ # == About
5
+ #
6
+ # Implements <tt>IMW.open</tt> which returns an appropriate +IMW+
7
+ # object given a URI.
8
+ #
9
+ # Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
10
+ # Copyright:: Copyright (c) 2008 infochimps.org
11
+ # License:: GPL 3.0
12
+ # Website:: http://infinitemonkeywrench.org/
13
+ #
14
+ # puts "#{File.basename(__FILE__)}: Something clever" # at bottom
15
+
16
+ require 'uri'
17
+ require 'open-uri'
18
+ require 'imw/utils'
19
+ require 'imw/files/basicfile'
20
+ require 'imw/files/archive'
21
+ require 'imw/files/compressible'
22
+ require 'imw/files/compressed_file'
23
+
24
+ module IMW
25
+
26
+ # Parse +path+ and return an appropriate handler. Pass in <tt>:write
27
+ # => true</tt> to open for writing.
28
+ #
29
+ # IMW.open("/tmp/test.csv") # => IMW::Files::Csv("/tmp/test.csv')
30
+ #
31
+ #
32
+ def self.open path, options = {}
33
+ mode = options[:write] ? 'w' : 'r'
34
+ Files.file_class_for(path, options).new(path, mode, options)
35
+ end
36
+
37
+ def self.open! path, options = {}
38
+ self.open path, options.reverse_merge(:write => true)
39
+ end
40
+
41
+ module Files
42
+
43
+
44
+ # There is certainly a cleaner way to do this.
45
+ autoload :Text, 'imw/files/text'
46
+ autoload :Binary, 'imw/files/binary'
47
+ autoload :Yaml, 'imw/files/yaml'
48
+ autoload :Csv, 'imw/files/csv'
49
+ autoload :Json, 'imw/files/json'
50
+ autoload :Bz2, 'imw/files/compressed_files_and_archives'
51
+ autoload :Gz, 'imw/files/compressed_files_and_archives'
52
+ autoload :Tar, 'imw/files/compressed_files_and_archives'
53
+ autoload :TarBz2, 'imw/files/compressed_files_and_archives'
54
+ autoload :TarGz, 'imw/files/compressed_files_and_archives'
55
+ autoload :Rar, 'imw/files/compressed_files_and_archives'
56
+ autoload :Zip, 'imw/files/compressed_files_and_archives'
57
+ autoload :Xml, 'imw/files/sgml'
58
+ autoload :Html, 'imw/files/sgml'
59
+
60
+
61
+ # An array used to match files to classes to handle them. The
62
+ # first element of each array is the regexp and the second names
63
+ # the class to handle the file.
64
+ #
65
+ # IMW::Files::EXTENSION_HANDLERS << [ /\.csv$/, :csv ] #=> IMW::Files::Csv
66
+ # IMW::Files::EXTENSION_HANDLERS << [ /\.txt$/, "Text" ] #=> IMW::Files::Text
67
+ # IMW::Files::EXTENSION_HANDLERS << [ /\.myclass%/, MyClass ] #=> MyClass
68
+ #
69
+ # Elements at the end of the array have greater precedence which
70
+ # allows, say, <tt>.tar.gz</tt> to be handled differently from
71
+ # <tt>.gz</tt>.
72
+ EXTENSION_HANDLERS = [
73
+ [/./, :Text], # catchall
74
+ [/\.txt$/, :Text],
75
+ [/\.txt$/, :Text],
76
+ [/\.dat$/, :Text],
77
+ [/\.ascii$/, :Text],
78
+ [/\.yaml$/, :Yaml],
79
+ [/\.yml$/, :Yaml],
80
+ [/\.csv$/, :Csv],
81
+ [/\.tsv$/, :Tsv],
82
+ [/\.json$/, :Json],
83
+ [/\.bz2$/, :Bz2],
84
+ [/\.gz$/, :Gz],
85
+ [/\.tar\.bz2$/, :TarBz2],
86
+ [/\.tbz2$/, :TarBz2],
87
+ [/\.tar\.gz$/, :TarGz],
88
+ [/\.tgz$/, :TarGz],
89
+ [/\.tar$/, :Tar],
90
+ [/\.rar$/, :Rar],
91
+ [/\.zip$/, :Zip],
92
+ [/\.xml$/, :Xml],
93
+ [/\.html$/, :Html],
94
+ [/\.htm$/, :Html]
95
+ ]
96
+
97
+ protected
98
+ def self.file_class_for path, options = {}
99
+ klass = options.delete(:as)
100
+ unless klass
101
+ EXTENSION_HANDLERS.reverse_each do |regexp, thing| # end has greater precedence
102
+ next unless regexp =~ path
103
+ klass = thing
104
+ break
105
+ end
106
+ end
107
+ klass.is_a?(Class) ? klass : class_eval(klass.to_s)
108
+ end
109
+ end
110
+ end