imw 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (111) hide show
  1. data/.gitignore +15 -0
  2. data/CHANGELOG +0 -0
  3. data/LICENSE +674 -0
  4. data/README.rdoc +101 -0
  5. data/Rakefile +20 -0
  6. data/VERSION +1 -0
  7. data/etc/imwrc.rb +76 -0
  8. data/lib/imw.rb +42 -0
  9. data/lib/imw/boot.rb +58 -0
  10. data/lib/imw/dataset.rb +233 -0
  11. data/lib/imw/dataset/datamapper.rb +66 -0
  12. data/lib/imw/dataset/datamapper/time_and_user_stamps.rb +37 -0
  13. data/lib/imw/dataset/loaddump.rb +50 -0
  14. data/lib/imw/dataset/old/file_collection.rb +88 -0
  15. data/lib/imw/dataset/old/file_collection_utils.rb +71 -0
  16. data/lib/imw/dataset/scaffold.rb +132 -0
  17. data/lib/imw/dataset/scraped_uri.rb +305 -0
  18. data/lib/imw/dataset/scrub/old_working_scrubber.rb +87 -0
  19. data/lib/imw/dataset/scrub/scrub.rb +147 -0
  20. data/lib/imw/dataset/scrub/scrub_simple_url.rb +38 -0
  21. data/lib/imw/dataset/scrub/scrub_test.rb +60 -0
  22. data/lib/imw/dataset/scrub/slug.rb +101 -0
  23. data/lib/imw/dataset/stats.rb +73 -0
  24. data/lib/imw/dataset/stats/counter.rb +23 -0
  25. data/lib/imw/dataset/task.rb +38 -0
  26. data/lib/imw/dataset/workflow.rb +81 -0
  27. data/lib/imw/files.rb +110 -0
  28. data/lib/imw/files/archive.rb +113 -0
  29. data/lib/imw/files/basicfile.rb +122 -0
  30. data/lib/imw/files/binary.rb +28 -0
  31. data/lib/imw/files/compressed_file.rb +93 -0
  32. data/lib/imw/files/compressed_files_and_archives.rb +348 -0
  33. data/lib/imw/files/compressible.rb +103 -0
  34. data/lib/imw/files/csv.rb +112 -0
  35. data/lib/imw/files/json.rb +41 -0
  36. data/lib/imw/files/sgml.rb +65 -0
  37. data/lib/imw/files/text.rb +68 -0
  38. data/lib/imw/files/yaml.rb +46 -0
  39. data/lib/imw/packagers.rb +8 -0
  40. data/lib/imw/packagers/archiver.rb +108 -0
  41. data/lib/imw/packagers/s3_mover.rb +28 -0
  42. data/lib/imw/parsers.rb +7 -0
  43. data/lib/imw/parsers/html_parser.rb +382 -0
  44. data/lib/imw/parsers/html_parser/matchers.rb +306 -0
  45. data/lib/imw/parsers/line_parser.rb +87 -0
  46. data/lib/imw/parsers/regexp_parser.rb +72 -0
  47. data/lib/imw/utils.rb +24 -0
  48. data/lib/imw/utils/components.rb +61 -0
  49. data/lib/imw/utils/config.rb +46 -0
  50. data/lib/imw/utils/error.rb +54 -0
  51. data/lib/imw/utils/extensions/array.rb +125 -0
  52. data/lib/imw/utils/extensions/class/attribute_accessors.rb +8 -0
  53. data/lib/imw/utils/extensions/core.rb +43 -0
  54. data/lib/imw/utils/extensions/dir.rb +24 -0
  55. data/lib/imw/utils/extensions/file_core.rb +64 -0
  56. data/lib/imw/utils/extensions/hash.rb +218 -0
  57. data/lib/imw/utils/extensions/hpricot.rb +48 -0
  58. data/lib/imw/utils/extensions/string.rb +49 -0
  59. data/lib/imw/utils/extensions/struct.rb +42 -0
  60. data/lib/imw/utils/extensions/symbol.rb +28 -0
  61. data/lib/imw/utils/extensions/typed_struct.rb +22 -0
  62. data/lib/imw/utils/extensions/uri.rb +59 -0
  63. data/lib/imw/utils/log.rb +67 -0
  64. data/lib/imw/utils/misc.rb +63 -0
  65. data/lib/imw/utils/paths.rb +115 -0
  66. data/lib/imw/utils/uri.rb +59 -0
  67. data/lib/imw/utils/uuid.rb +33 -0
  68. data/lib/imw/utils/validate.rb +38 -0
  69. data/lib/imw/utils/version.rb +12 -0
  70. data/lib/imw/utils/view.rb +113 -0
  71. data/lib/imw/utils/view/dump_csv.rb +112 -0
  72. data/lib/imw/utils/view/dump_csv_older.rb +117 -0
  73. data/spec/data/sample.csv +131 -0
  74. data/spec/data/sample.tsv +131 -0
  75. data/spec/data/sample.txt +131 -0
  76. data/spec/data/sample.xml +653 -0
  77. data/spec/data/sample.yaml +652 -0
  78. data/spec/imw/dataset/datamapper/uri_spec.rb +43 -0
  79. data/spec/imw/dataset/datamapper_spec_helper.rb +11 -0
  80. data/spec/imw/files/archive_spec.rb +118 -0
  81. data/spec/imw/files/basicfile_spec.rb +121 -0
  82. data/spec/imw/files/bz2_spec.rb +32 -0
  83. data/spec/imw/files/compressed_file_spec.rb +96 -0
  84. data/spec/imw/files/compressible_spec.rb +100 -0
  85. data/spec/imw/files/file_spec.rb +144 -0
  86. data/spec/imw/files/gz_spec.rb +32 -0
  87. data/spec/imw/files/rar_spec.rb +33 -0
  88. data/spec/imw/files/tar_spec.rb +31 -0
  89. data/spec/imw/files/text_spec.rb +23 -0
  90. data/spec/imw/files/zip_spec.rb +31 -0
  91. data/spec/imw/files_spec.rb +38 -0
  92. data/spec/imw/packagers/archiver_spec.rb +125 -0
  93. data/spec/imw/packagers/s3_mover_spec.rb +7 -0
  94. data/spec/imw/parsers/line_parser_spec.rb +96 -0
  95. data/spec/imw/parsers/regexp_parser_spec.rb +42 -0
  96. data/spec/imw/utils/extensions/file_core_spec.rb +72 -0
  97. data/spec/imw/utils/extensions/find_spec.rb +113 -0
  98. data/spec/imw/utils/paths_spec.rb +38 -0
  99. data/spec/imw/workflow/rip/local_spec.rb +89 -0
  100. data/spec/imw/workflow/rip_spec.rb +27 -0
  101. data/spec/rcov.opts +1 -0
  102. data/spec/spec.opts +4 -0
  103. data/spec/spec_helper.rb +32 -0
  104. data/spec/support/archive_contents_matcher.rb +94 -0
  105. data/spec/support/custom_matchers.rb +21 -0
  106. data/spec/support/directory_contents_matcher.rb +61 -0
  107. data/spec/support/extensions.rb +18 -0
  108. data/spec/support/file_contents_matcher.rb +50 -0
  109. data/spec/support/random.rb +210 -0
  110. data/spec/support/without_regard_to_order_matcher.rb +58 -0
  111. metadata +196 -0
@@ -0,0 +1,38 @@
1
+
2
+
3
module IMW
  # Mixin for URI-like objects. The host object must provide +to_dirpath+;
  # it is not defined in this module.
  module URIScrubber
    # The scrubbed form of the receiver, which is simply whatever the
    # receiver's own +to_dirpath+ returns.
    def scrubbed; to_dirpath; end
  end
end
11
+
12
module Scrub
  #
  # Scrubber for URL-ish strings. A string is considered valid when its
  # lowercased form already equals its sanitized form; the target format is
  # described by +complaint+ below.
  #
  class SimplifiedURL < Scrub::Generic
    attr_accessor :uri

    self.complaint = "should follow our zany simplified URL rules: com.host.dot-reversed:schemeifnothttp/path/seg_men-ts/stuff.ext-SHA1ifweird"
    self.validator = %r{#{Addressable::URI::SAFE_CHARS}#{Addressable::URI::RESERVED_CHARS}}u
    self.replacer  = ''
    include Scrub::Lowercased

    # True when +str+ (stringified and lowercased) is already identical to
    # what +sanitize+ would produce for it.
    def valid?(str)
      lowered = str.to_s.downcase
      lowered == sanitize(str)
    end

    # Parse +str+ heuristically as a URI, normalize it, and return the
    # URI's +scrubbed+ form when its host is valid, otherwise its +uuid_path+.
    #
    # +host_valid?+, +scrubbed+ and +uuid_path+ are expected to be added to
    # the URI object elsewhere (e.g. IMW::URIScrubber supplies +scrubbed+).
    def sanitize(str)
      # if this fails just normalize once, or don't set $KCODE: http://bit.ly/1664vp
      parsed = Addressable::URI.heuristic_parse(str.to_s).normalize
      parsed.host_valid? ? parsed.scrubbed : parsed.uuid_path
    end
  end
end
@@ -0,0 +1,60 @@
1
+ #!/usr/bin/env ruby
2
+ # -*- coding: utf-8 -*-
3
+ require 'scrub'
4
+ require 'scrub_simple_url'
5
+
6
# Sample inputs fed to every scrubber: plain identifiers, whitespace and
# control characters, non-ASCII text, punctuation, and a spread of URL shapes.
test_strings = [
  nil, '', '12', '123', 'simple', 'UPPER', 'CamelCased', 'iden_tifier_23_',
  'twentyfouralphacharslong', 'twentyfiveatozonlyletters', 'hello.-_there@funnychar.com',
  "tab\t", "newline\n",
  "Iñtërnâtiônàlizætiøn",
  'semicolon;', 'quote"', 'tick\'', 'backtick`', 'percent%', 'plus+', 'space ',
  'leftanglebracket<', 'ampersand&',
  "control char-bel\x07",
  "http://foo.bar.com/",
  "HTTP://FOO.BAR.com",
  ".com/zazz",
  "scheme://user_name@user_acct:passwd@host-name.museum:9047/path;pathquery/p!a-th~2/path?query=param&amp;query=pa%20ram#fragment",
  "http://web.site.com/path/path/file.ext",
  "ftp://ftp.site.com/path/path/file.ext",
  "/absolute/pathname/file.ext",
  "http://foo.bar.com/.hidden_file_with.ext",
  "http://foo.bar.com/.hidden_file",
  "dir/--/non_alpha_path_segment.ext",
  "http://foo.bar.com/dir/../two_dots_in_path",
]

# Scrubbers under test, keyed by a display label. The disabled ones are kept
# as comments so they can be switched back on.
scrubbers = {
  # :unicode_title  => Scrub::UnicodeTitle.new,
  # :title          => Scrub::Title.new,
  # :identifier     => Scrub::Identifier.new,
  # :free_text      => Scrub::FreeText.new,
  :handle         => Scrub::Handle.new,
  :simplified_url => Scrub::SimplifiedURL.new,
  # :domain         => Scrub::Domain.new,
  # :email          => Scrub::Email.new,
}

# For each scrubber print one row per input: validity flag, sanitized value,
# original value. Rows judged invalid are listed before the valid ones.
scrubbers.each_pair do |label, scrubber|
  puts label
  rows = test_strings.map do |input|
    is_valid = !!scrubber.valid?(input)
    [is_valid, scrubber.sanitize(input).inspect, input.inspect]
  end
  ordered = rows.sort_by { |is_valid, _sanitized, _original| is_valid ? 1 : -1 }
  ordered.each { |row| puts " %-5s %-30s %-30s" % row }
end
49
+
50
+
51
+
52
+ # 'foo@bar.com', 'foo@newskool-tld.museum', 'foo@twoletter-tld.de', 'foo@nonexistant-tld.qq',
53
+ # 'r@a.wk', '1234567890-234567890-234567890-234567890-234567890-234567890-234567890-234567890-234567890@gmail.com',
54
+ # 'hello.-_there@funnychar.com', 'uucp%addr@gmail.com', 'hello+routing-str@gmail.com',
55
+ # 'domain@can.haz.many.sub.doma.in',],
56
+ # :invalid => [nil, '', '!!@nobadchars.com', 'foo@no-rep-dots..com', 'foo@badtld.xxx', 'foo@toolongtld.abcdefg',
57
+ # 'Iñtërnâtiônàlizætiøn@hasnt.happened.to.email', 'need.domain.and.tld@de', "tab\t", "newline\n",
58
+ # 'r@.wk', '1234567890-234567890-234567890-234567890-234567890-234567890-234567890-234567890-234567890@gmail2.com',
59
+ # # these are technically allowed but not seen in practice:
60
+ # 'uucp!addr@gmail.com', 'semicolon;@gmail.com', 'quote"@gmail.com', 'tick\'@gmail.com', 'backtick`@gmail.com', 'space @gmail.com', 'bracket<@gmail.com', 'bracket>@gmail.com'
@@ -0,0 +1,101 @@
1
+ require 'rubygems'
2
+ require 'addressable/uri'
3
+ require 'uuidtools'
4
+ require 'scrub'
5
+ require 'scrub_simple_url'
6
+
7
module IMW

  #
  # +handle+ -- reasonable effort at a uniq-ish, but human-comprehensible string.
  # Handle should only contain the characters A-Za-z0-9_-./
  #
  class Slug
    # A humane representation of the handle ('that-one-time-at_foo')
    attr_reader :handle
    # The purportedly unique string
    attr_accessor :uniqish

    # Build a slug from +handle+. The sanitized form is stored as +handle+;
    # the untouched argument is kept as +uniqish+.
    def initialize(handle)
      self.handle  = handle
      # `handle` here is the method argument (it shadows the reader), so
      # uniqish keeps the raw, unsanitized value.
      self.uniqish = handle
    end

    #
    # Unless overridden, use the uniqish to make a name-based (SHA1) UUID
    # within the URL namespace.
    #
    # NOTE(review): +UUID+ and +UUID_URL_NAMESPACE+ are expected to come from
    # the uuidtools gem required at the top of this file; the exact constant
    # names depend on the gem version -- confirm.
    #
    def uuid
      UUID.sha1_create(UUID_URL_NAMESPACE, uniqish)
    end

    # Handle with only \w characters -- safe for everything there be.
    # Runs of other characters become '-', existing underscores are doubled,
    # and runs of '/' or ':' become a single '_'. Returns '' for a nil handle.
    def url_sane
      return '' unless handle
      handle.gsub(/[^\w\/\:]+/, '-').gsub(/_/, '__').gsub(%r{[/:]+}, '_')
    end

    # Assign the handle, storing its sanitized form (see sanitize_handle).
    def handle=(t)
      @handle = self.class.sanitize_handle(t)
    end

    # Strip all but handle-safe characters: every run of characters outside
    # A-Za-z0-9_-./ is replaced by +turd+. nil is passed through as nil so
    # that a handle-less slug can exist (url_sane already copes with that).
    def self.sanitize_handle(t, turd = '-')
      return nil if t.nil?
      t.to_s.gsub(%r{[^\w\-\./]+}, turd)
    end
  end

  #
  # Uses a URL (that's locator, not URI) as a
  # presumed-uniq identifier.
  #
  # +uniqish+ returns the full normalized URL
  #
  # +handle+ is formed from the dot-reversed host, the scheme (if not http) and a
  # sanitized version of the path. (The query string, fragment, etc are stripped
  # from the handle)
  #
  class URLSlug < Slug
    attr_accessor :url

    # Parse and normalize +url_str+; raises RuntimeError ("Bad URL ...") when
    # the parsed URL has no host.
    #
    # NOTE(review): +munge_url+ is not defined in this file; it must be
    # supplied elsewhere for this constructor to succeed -- confirm.
    def initialize(url_str)
      self.url = Addressable::URI.heuristic_parse(url_str).normalize
      raise "Bad URL #{url}" unless url.host
      self.uniqish = url.to_s
      self.handle  = munge_url
    end

    # +uuid+ is inherited from Slug: it hashes +uniqish+, i.e. the full
    # normalized URL string set above.
  end
end
76
+
77
+
78
+
79
# Mixin that fills in a record's +name+ / +handle+ from a slug before save.
#
# The including class must respond to +before+ (a callback registration
# hook, called as <tt>before :save, :create_slug</tt>) and its instances must
# expose +name+, +name=+, +url+, +handle+ and +handle=+. +name+ must respond
# to +blank?+.
module Sluggable
  protected

  # Build a slug for this record and use it to populate missing fields.
  #
  # When the class was configured with <tt>slug_on :url</tt>, or the record
  # has a blank name, the slug comes from +url+ and also overwrites +name+.
  # Otherwise the slug comes from +name+. An already-set +handle+ is kept.
  def create_slug
    if (self.class.slug_on == :url) || (self.name.blank?)
      slug = IMW::URLSlug.new(self.url)
      self.name = slug.handle
    else
      slug = IMW::Slug.new(self.name)
    end
    self.handle ||= slug.handle
  end

  public

  # Hook run when the module is included: registers +create_slug+ as a
  # before-save callback and gives the class a +slug_on+ setting.
  def self.included(base)
    base.before :save, :create_slug
    base.class_eval do
      # Reads the slug source; the first non-nil value passed in sticks and
      # later arguments are ignored.
      def self.slug_on(s = nil)
        @slug_on ||= s
      end
    end
  end
end
@@ -0,0 +1,73 @@
1
+ #
2
+ # h2. lib/imw/dataset/stats.rb -- statistics for datasets
3
+ #
4
+ # == About
5
+ #
6
+ # Implements methods to calculate very basic statistical properties of
7
+ # a dataset.
8
+ #
9
+ # Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
10
+ # Copyright:: Copyright (c) 2008 infochimps.org
11
+ # License:: GPL 3.0
12
+ # Website:: http://infinitemonkeywrench.org/
13
+ #
14
+ # puts "#{File.basename(__FILE__)}: Something clever" # at bottom
15
+
16
+
17
module IMW
  class Dataset
    #
    # simple histogram
    #
    # Runs down one column/attribute of a dataset (see +slice+) and returns a
    # new dataset of the same class built from <tt>[count, element]</tt> pairs,
    # one pair per distinct element.
    #
    def hist(slicer)
      counts = Hash.new(0)
      slice(slicer).each { |el| counts[el] += 1 }
      self.class.new(counts.map { |el, ct| [ct, el] })
    end

    # Extract one value per row. A +slicer+ that responds to +call+ is
    # invoked with each row; anything else is used as an index/key into the
    # row (<tt>row[slicer]</tt>).
    def slice(slicer)
      if slicer.respond_to?(:call)
        map { |row| slicer.call(row) }
      else
        map { |row| row[slicer] }
      end
    end

    #
    # Report
    #
    # Prints a frequency table (unless <tt>:do_hist => false</tt>) followed
    # by size statistics for the slice selected by +slicer+.
    #
    # Recognized +opts+ (defaults in parentheses): <tt>:n_top</tt> (20),
    # <tt>:fmt</tt> ("%7d\t%s"), <tt>:do_hist</tt> (true), <tt>:hist_args</tt>
    # ([]). The caller's hash is not modified.
    #
    def report(slicer, opts = {})
      opts   = default_report_opts.merge(opts)
      counts = hist(slicer)
      report_hist(data, counts, slicer, opts) if opts[:do_hist]
      report_sizes(data, counts, slicer, opts)
    end

    # Print the number of unique, total and nil elements, plus the min and
    # max of the non-nil unique values. +counts+ is a collection of
    # <tt>[count, element]</tt> pairs as produced by +hist+.
    def report_sizes(data, counts, slicer, opts = {})
      fmt = opts[:fmt] || default_report_opts[:fmt]
      # Each entry is a [count, element] pair, so nils are found by looking
      # at the element and adding up how often it occurred.
      nil_total = counts.inject(0) { |sum, (ct, el)| el.nil? ? sum + ct : sum }
      uniqvals  = counts.map { |ct, el| el }.compact
      puts fmt % [counts.length, "unique elements"]
      puts fmt % [data.length, "total elements"]
      puts fmt % [nil_total, "nil elements"]
      puts " min:\t#{uniqvals.min}"
      puts " max:\t#{uniqvals.max}"
    end

    # Most popular: print the (up to) <tt>opts[:n_top]</tt> most frequent
    # elements, least frequent first.
    def report_hist(data, counts, slicer, opts = {})
      opts = default_report_opts.merge(opts)
      # Array#last copes with fewer than n_top entries; a negative range
      # slice would return nil in that case.
      top = counts.sort_by { |ct, el| ct }.last(opts[:n_top])
      puts "Top #{opts[:n_top]} elements for slice through #{slicer}:"
      puts " -freq-\t-element-"
      puts top.map { |ct, el| opts[:fmt] % [ct, el] }
      puts "-------\t-------"
    end

    # Fresh copy of the default reporting options.
    def default_report_opts
      { :n_top => 20, :hist_args => [], :fmt => "%7d\t%s", :do_hist => true }
    end
    private :default_report_opts

  end
end
@@ -0,0 +1,23 @@
1
module IMW
  # A Hash that tallies how many times each value has been recorded.
  class RecordCounter < Hash
    # Add one to the tally for +val+ (starting from zero when it has not been
    # seen) and return the new tally.
    def record(val)
      self[val] = (self[val] || 0) + 1
    end

    # Yield when +val+ has been recorded before; in either case record it
    # afterwards. Returns the updated tally.
    def if_seen(val, &block)
      yield if self[val]
      record(val)
    end

    # Yield when +val+ has NOT been recorded before; in either case record
    # it afterwards. Returns the updated tally.
    def unless_seen(val, &block)
      yield unless self[val]
      record(val)
    end

  end
end
@@ -0,0 +1,38 @@
1
+ #
2
+ # h2. lib/imw/dataset/task.rb -- defines IMW::Task
3
+ #
4
+ # == About
5
+ #
6
+ # This file defines a class <tt>IMW::Task</tt> which subclasses
7
+ # <tt>Rake::Task</tt>. Tasks defined in IMW should be instances of
8
+ # <tt>IMW::Task</tt>.
9
+ #
10
+ # Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
11
+ # Copyright:: Copyright (c) 2008 infochimps.org
12
+ # License:: GPL 3.0
13
+ # Website:: http://infinitemonkeywrench.org/
14
+ #
15
+ # puts "#{File.basename(__FILE__)}: Something clever" # at bottom
16
+
17
+ require 'rake'
18
+
19
module IMW

  # Task class for tasks defined inside IMW. It currently adds nothing to
  # <tt>Rake::Task</tt>.
  class Task < Rake::Task; end

  class Dataset
    include Rake::TaskManager

    # Return a new (or existing) <tt>IMW::Task</tt> with the given
    # +name+. Dependencies can be declared and a block passed in just
    # as in Rake; both are handed straight to
    # <tt>Rake::TaskManager#define_task</tt>.
    def task(name, &block)
      define_task(IMW::Task, name, &block)
    end

  end
end
36
+
37
+
38
+
@@ -0,0 +1,81 @@
1
+ #
2
+ # lib/imw/dataset/workflow.rb -- implements the workflow module
3
+ #
4
+ # == About
5
+ #
6
+ # This file implements the <tt>IMW::Workflow</tt> class which tailors
7
+ # the functionality of Rake for IMW objects.
8
+ #
9
+ # Author:: Philip flip Kromer for infochimps.org (mailto:coders@infochimps.org)
10
+ # Copyright:: Copyright (c) 2008 infochimps.org
11
+ # License:: GPL 3.0
12
+ # Website:: http://infinitemonkeywrench.org/
13
+ #
14
+
15
+ require 'imw/dataset/scaffold'
16
+ require 'imw/dataset/task'
17
+
18
module IMW

  # The <tt>IMW::Workflow</tt> module is a collection of methods which
  # define Rake[http://rake.rubyforge.org/] tasks specialized for each
  # dataset. The host object is expected to provide +define_task+ and
  # +[]+ (task lookup by name).
  module Workflow

    # Define the default tasks associated with each dataset by running each
    # task-creating helper in turn. All helpers except
    # +create_workflow_tasks+ are expected to be defined by the host.
    def create_default_tasks
      [
        :create_directories_task,
        :create_symlinks_task,
        :create_initialize_task,
        :create_delete_data_task,
        :create_destroy_task,
        :create_workflow_tasks
      ].each { |helper| __send__(helper) }
    end

    # Sets the default tasks in this workflow.
    #
    # The default tasks constitute a set of consecutive actions that
    # must be taken in order: <tt>:rip</tt>, <tt>:parse</tt>,
    # <tt>:munge</tt>, <tt>:fix</tt>, and <tt>:package</tt>. Each task
    # is a <tt>Rake::Task</tt> which depends on the one before it, and
    # each is given its initial comment afterwards.
    def set_default_tasks
      chain = [
        [:rip,     []],
        [:parse,   :rip],
        [:munge,   :parse],
        [:fix,     :munge],
        [:package, :fix]
      ]
      chain.each { |name, prerequisite| define_task(Rake::Task, { name => prerequisite }) }
      comment_default_tasks
    end

    # Set the initial comments for each of the default tasks.
    def comment_default_tasks
      [
        [:rip,     "Rip dataset from an origin"],
        [:parse,   "Parse dataset into intermediate form"],
        [:munge,   "Munge dataset's structure into desired form"],
        [:fix,     "Fix and format dataset"],
        [:package, "Package dataset into a final format"]
      ].each { |name, text| self[name].comment = text }
    end

    # Creates the task dependency chain <tt>:package => :fix => :munge
    # => :peel => :rip => :initialize</tt>. Before each task is defined,
    # <tt>@last_description</tt> is set to that task's description.
    def create_workflow_tasks
      steps = [
        [:rip,     :initialize, "Obtain data from some source."],
        [:peel,    :rip,        "Extract datafiles from ripped data."],
        [:munge,   :peel,       "Transform records in a dataset."],
        [:fix,     :munge,      "Reconcile records."],
        [:package, :fix,        "Package dataset in final form."]
      ]
      steps.each do |name, prerequisite, description|
        @last_description = description
        define_task(IMW::Task, name => [prerequisite])
      end
    end

  end
end
80
+
81
+ # puts "#{File.basename(__FILE__)}: You find your flow next to a tall tree. Ahhhh."
@@ -0,0 +1,110 @@
1
+ #
2
+ # h2. lib/imw/files.rb -- uniform interface to various files
3
+ #
4
+ # == About
5
+ #
6
+ # Implements <tt>IMW.open</tt> which returns an appropriate +IMW+
7
+ # object given a URI.
8
+ #
9
+ # Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
10
+ # Copyright:: Copyright (c) 2008 infochimps.org
11
+ # License:: GPL 3.0
12
+ # Website:: http://infinitemonkeywrench.org/
13
+ #
14
+ # puts "#{File.basename(__FILE__)}: Something clever" # at bottom
15
+
16
+ require 'uri'
17
+ require 'open-uri'
18
+ require 'imw/utils'
19
+ require 'imw/files/basicfile'
20
+ require 'imw/files/archive'
21
+ require 'imw/files/compressible'
22
+ require 'imw/files/compressed_file'
23
+
24
module IMW

  # Parse +path+ and return an appropriate handler. Pass in <tt>:write
  # => true</tt> to open for writing, and <tt>:as => handler</tt> to bypass
  # extension matching (see IMW::Files.file_class_for).
  #
  #   IMW.open("/tmp/test.csv") # => IMW::Files::Csv("/tmp/test.csv")
  #
  # The handler is constructed as <tt>handler.new(path, mode, options)</tt>
  # where +mode+ is 'w' or 'r' and +options+ no longer contains <tt>:as</tt>.
  # The hash passed in by the caller is left untouched.
  def self.open(path, options = {})
    # file_class_for removes :as from the hash it is given, so work on a copy.
    options = options.dup
    mode    = options[:write] ? 'w' : 'r'
    Files.file_class_for(path, options).new(path, mode, options)
  end

  # Like IMW.open but defaults to opening for writing. An explicit
  # <tt>:write</tt> entry in +options+ still wins.
  def self.open!(path, options = {})
    self.open(path, { :write => true }.merge(options))
  end

  module Files

    # There is certainly a cleaner way to do this.
    autoload :Text,   'imw/files/text'
    autoload :Binary, 'imw/files/binary'
    autoload :Yaml,   'imw/files/yaml'
    autoload :Csv,    'imw/files/csv'
    # NOTE(review): :Tsv is referenced by EXTENSION_HANDLERS below but had no
    # autoload; it is presumably defined alongside Csv -- confirm.
    autoload :Tsv,    'imw/files/csv'
    autoload :Json,   'imw/files/json'
    autoload :Bz2,    'imw/files/compressed_files_and_archives'
    autoload :Gz,     'imw/files/compressed_files_and_archives'
    autoload :Tar,    'imw/files/compressed_files_and_archives'
    autoload :TarBz2, 'imw/files/compressed_files_and_archives'
    autoload :TarGz,  'imw/files/compressed_files_and_archives'
    autoload :Rar,    'imw/files/compressed_files_and_archives'
    autoload :Zip,    'imw/files/compressed_files_and_archives'
    autoload :Xml,    'imw/files/sgml'
    autoload :Html,   'imw/files/sgml'

    # An array used to match files to classes to handle them. The
    # first element of each array is the regexp and the second names
    # the class to handle the file.
    #
    #   IMW::Files::EXTENSION_HANDLERS << [ /\.csv$/, :csv ] #=> IMW::Files::Csv
    #   IMW::Files::EXTENSION_HANDLERS << [ /\.txt$/, "Text" ] #=> IMW::Files::Text
    #   IMW::Files::EXTENSION_HANDLERS << [ /\.myclass$/, MyClass ] #=> MyClass
    #
    # Elements at the end of the array have greater precedence which
    # allows, say, <tt>.tar.gz</tt> to be handled differently from
    # <tt>.gz</tt>.
    EXTENSION_HANDLERS = [
      [/./,           :Text], # catchall
      [/\.txt$/,      :Text],
      [/\.dat$/,      :Text],
      [/\.ascii$/,    :Text],
      [/\.yaml$/,     :Yaml],
      [/\.yml$/,      :Yaml],
      [/\.csv$/,      :Csv],
      [/\.tsv$/,      :Tsv],
      [/\.json$/,     :Json],
      [/\.bz2$/,      :Bz2],
      [/\.gz$/,       :Gz],
      [/\.tar\.bz2$/, :TarBz2],
      [/\.tbz2$/,     :TarBz2],
      [/\.tar\.gz$/,  :TarGz],
      [/\.tgz$/,      :TarGz],
      [/\.tar$/,      :Tar],
      [/\.rar$/,      :Rar],
      [/\.zip$/,      :Zip],
      [/\.xml$/,      :Xml],
      [/\.html$/,     :Html],
      [/\.htm$/,      :Html]
    ]

    # Return the class that should handle +path+.
    #
    # If +options+ contains <tt>:as</tt> it is removed from the hash and
    # used as the handler; otherwise the last entry of EXTENSION_HANDLERS
    # whose regexp matches +path+ wins. A handler may be a Class (returned
    # as is) or a Symbol/String naming a constant, looked up from within
    # IMW::Files (a lowercase first letter, e.g. <tt>:csv</tt>, is accepted).
    #
    # Raises ArgumentError when nothing matches (e.g. an empty path) and
    # NameError when a named handler cannot be found.
    def self.file_class_for(path, options = {})
      handler = options.delete(:as) || handler_for_path(path)
      raise ArgumentError, "No file handler found for #{path.inspect}" if handler.nil?
      resolve_handler(handler)
    end

    # Walk EXTENSION_HANDLERS from the end (entries at the end have greater
    # precedence) and return the first handler whose regexp matches.
    def self.handler_for_path(path)
      target = path.to_s
      EXTENSION_HANDLERS.reverse_each do |regexp, handler|
        return handler if regexp =~ target
      end
      nil
    end

    # Turn a handler spec into a Class. Names are resolved with const_get,
    # one '::'-separated segment at a time, rather than by evaluating the
    # string as code.
    def self.resolve_handler(handler)
      return handler if handler.is_a?(Class)
      segments = handler.to_s.split('::').reject { |segment| segment.empty? }
      segments.inject(self) do |scope, segment|
        scope.const_get(segment[0, 1].upcase + segment[1..-1])
      end
    end

    private_class_method :handler_for_path, :resolve_handler
  end
end