imw 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (111) hide show
  1. data/.gitignore +15 -0
  2. data/CHANGELOG +0 -0
  3. data/LICENSE +674 -0
  4. data/README.rdoc +101 -0
  5. data/Rakefile +20 -0
  6. data/VERSION +1 -0
  7. data/etc/imwrc.rb +76 -0
  8. data/lib/imw.rb +42 -0
  9. data/lib/imw/boot.rb +58 -0
  10. data/lib/imw/dataset.rb +233 -0
  11. data/lib/imw/dataset/datamapper.rb +66 -0
  12. data/lib/imw/dataset/datamapper/time_and_user_stamps.rb +37 -0
  13. data/lib/imw/dataset/loaddump.rb +50 -0
  14. data/lib/imw/dataset/old/file_collection.rb +88 -0
  15. data/lib/imw/dataset/old/file_collection_utils.rb +71 -0
  16. data/lib/imw/dataset/scaffold.rb +132 -0
  17. data/lib/imw/dataset/scraped_uri.rb +305 -0
  18. data/lib/imw/dataset/scrub/old_working_scrubber.rb +87 -0
  19. data/lib/imw/dataset/scrub/scrub.rb +147 -0
  20. data/lib/imw/dataset/scrub/scrub_simple_url.rb +38 -0
  21. data/lib/imw/dataset/scrub/scrub_test.rb +60 -0
  22. data/lib/imw/dataset/scrub/slug.rb +101 -0
  23. data/lib/imw/dataset/stats.rb +73 -0
  24. data/lib/imw/dataset/stats/counter.rb +23 -0
  25. data/lib/imw/dataset/task.rb +38 -0
  26. data/lib/imw/dataset/workflow.rb +81 -0
  27. data/lib/imw/files.rb +110 -0
  28. data/lib/imw/files/archive.rb +113 -0
  29. data/lib/imw/files/basicfile.rb +122 -0
  30. data/lib/imw/files/binary.rb +28 -0
  31. data/lib/imw/files/compressed_file.rb +93 -0
  32. data/lib/imw/files/compressed_files_and_archives.rb +348 -0
  33. data/lib/imw/files/compressible.rb +103 -0
  34. data/lib/imw/files/csv.rb +112 -0
  35. data/lib/imw/files/json.rb +41 -0
  36. data/lib/imw/files/sgml.rb +65 -0
  37. data/lib/imw/files/text.rb +68 -0
  38. data/lib/imw/files/yaml.rb +46 -0
  39. data/lib/imw/packagers.rb +8 -0
  40. data/lib/imw/packagers/archiver.rb +108 -0
  41. data/lib/imw/packagers/s3_mover.rb +28 -0
  42. data/lib/imw/parsers.rb +7 -0
  43. data/lib/imw/parsers/html_parser.rb +382 -0
  44. data/lib/imw/parsers/html_parser/matchers.rb +306 -0
  45. data/lib/imw/parsers/line_parser.rb +87 -0
  46. data/lib/imw/parsers/regexp_parser.rb +72 -0
  47. data/lib/imw/utils.rb +24 -0
  48. data/lib/imw/utils/components.rb +61 -0
  49. data/lib/imw/utils/config.rb +46 -0
  50. data/lib/imw/utils/error.rb +54 -0
  51. data/lib/imw/utils/extensions/array.rb +125 -0
  52. data/lib/imw/utils/extensions/class/attribute_accessors.rb +8 -0
  53. data/lib/imw/utils/extensions/core.rb +43 -0
  54. data/lib/imw/utils/extensions/dir.rb +24 -0
  55. data/lib/imw/utils/extensions/file_core.rb +64 -0
  56. data/lib/imw/utils/extensions/hash.rb +218 -0
  57. data/lib/imw/utils/extensions/hpricot.rb +48 -0
  58. data/lib/imw/utils/extensions/string.rb +49 -0
  59. data/lib/imw/utils/extensions/struct.rb +42 -0
  60. data/lib/imw/utils/extensions/symbol.rb +28 -0
  61. data/lib/imw/utils/extensions/typed_struct.rb +22 -0
  62. data/lib/imw/utils/extensions/uri.rb +59 -0
  63. data/lib/imw/utils/log.rb +67 -0
  64. data/lib/imw/utils/misc.rb +63 -0
  65. data/lib/imw/utils/paths.rb +115 -0
  66. data/lib/imw/utils/uri.rb +59 -0
  67. data/lib/imw/utils/uuid.rb +33 -0
  68. data/lib/imw/utils/validate.rb +38 -0
  69. data/lib/imw/utils/version.rb +12 -0
  70. data/lib/imw/utils/view.rb +113 -0
  71. data/lib/imw/utils/view/dump_csv.rb +112 -0
  72. data/lib/imw/utils/view/dump_csv_older.rb +117 -0
  73. data/spec/data/sample.csv +131 -0
  74. data/spec/data/sample.tsv +131 -0
  75. data/spec/data/sample.txt +131 -0
  76. data/spec/data/sample.xml +653 -0
  77. data/spec/data/sample.yaml +652 -0
  78. data/spec/imw/dataset/datamapper/uri_spec.rb +43 -0
  79. data/spec/imw/dataset/datamapper_spec_helper.rb +11 -0
  80. data/spec/imw/files/archive_spec.rb +118 -0
  81. data/spec/imw/files/basicfile_spec.rb +121 -0
  82. data/spec/imw/files/bz2_spec.rb +32 -0
  83. data/spec/imw/files/compressed_file_spec.rb +96 -0
  84. data/spec/imw/files/compressible_spec.rb +100 -0
  85. data/spec/imw/files/file_spec.rb +144 -0
  86. data/spec/imw/files/gz_spec.rb +32 -0
  87. data/spec/imw/files/rar_spec.rb +33 -0
  88. data/spec/imw/files/tar_spec.rb +31 -0
  89. data/spec/imw/files/text_spec.rb +23 -0
  90. data/spec/imw/files/zip_spec.rb +31 -0
  91. data/spec/imw/files_spec.rb +38 -0
  92. data/spec/imw/packagers/archiver_spec.rb +125 -0
  93. data/spec/imw/packagers/s3_mover_spec.rb +7 -0
  94. data/spec/imw/parsers/line_parser_spec.rb +96 -0
  95. data/spec/imw/parsers/regexp_parser_spec.rb +42 -0
  96. data/spec/imw/utils/extensions/file_core_spec.rb +72 -0
  97. data/spec/imw/utils/extensions/find_spec.rb +113 -0
  98. data/spec/imw/utils/paths_spec.rb +38 -0
  99. data/spec/imw/workflow/rip/local_spec.rb +89 -0
  100. data/spec/imw/workflow/rip_spec.rb +27 -0
  101. data/spec/rcov.opts +1 -0
  102. data/spec/spec.opts +4 -0
  103. data/spec/spec_helper.rb +32 -0
  104. data/spec/support/archive_contents_matcher.rb +94 -0
  105. data/spec/support/custom_matchers.rb +21 -0
  106. data/spec/support/directory_contents_matcher.rb +61 -0
  107. data/spec/support/extensions.rb +18 -0
  108. data/spec/support/file_contents_matcher.rb +50 -0
  109. data/spec/support/random.rb +210 -0
  110. data/spec/support/without_regard_to_order_matcher.rb +58 -0
  111. metadata +196 -0
@@ -0,0 +1,103 @@
1
+ #
2
+ # h2. lib/imw/files/compressible.rb -- compression module
3
+ #
4
+ # == About
5
+ #
6
+ # Module used for compression of files. An including
7
+ # <tt>IMW::Files::BasicFile</tt> object gains +compress+ and
8
+ # <tt>compress!</tt> methods.
9
+ #
10
+ # By default, bzip2 is used for compression though gzip can also be
11
+ # specified (the full list of known compression programs is in
12
+ # <tt>IMW::Files::Compressible::COMPRESSION_PROGS</tt>). Zip and Rar
13
+ # compression are handled by the <tt>IMW::Files::Archive</tt> module.
14
+ #
15
+ # Decompression should be handled via the
16
+ # <tt>IMW::Files::CompressedFile</tt> class.
17
+ #
18
+ # Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
19
+ # Copyright:: Copyright (c) 2008 infochimps.org
20
+ # License:: GPL 3.0
21
+ # Website:: http://infinitemonkeywrench.org/
22
+ #
23
+ # puts "#{File.basename(__FILE__)}: Why is it that when you squeeze a lemon you get lemonade but when you squeeze a banana you just get a mess?" # at bottom
24
require 'shellwords'

module IMW
  module Files
    # Mixin that gives a file object +compress+ and <tt>compress!</tt>
    # methods.
    #
    # The methods below call +path+, +dirname+, +basename+ and
    # <tt>exist?</tt> on the including object and read its <tt>@path</tt>
    # and <tt>@dirname</tt> instance variables, so the including class
    # must provide them.
    module Compressible

      # Known compression programs.
      COMPRESSION_PROGS = [:bzip2, :gzip]

      # Extensions that are appended by each compression program.
      COMPRESSION_EXTS = {
        :bzip2 => '.bz2',
        :gzip  => '.gz'
      }

      # Compression flags for each program
      COMPRESSION_FLAGS = {
        :bzip2 => "-f",
        :gzip  => "-f"
      }

      protected

      # Check that +program+ is a valid compression program.
      #
      # Raises <tt>IMW::Error</tt> unless +program+ is listed in
      # <tt>COMPRESSION_PROGS</tt>.
      def ensure_valid_compression_program program
        return if COMPRESSION_PROGS.include? program
        raise IMW::Error.new("#{program} is not a valid compression program (#{COMPRESSION_PROGS.join(', ')}).")
      end

      # Construct the command passed to the shell to compress this
      # file using the given +program+.
      #
      # The path is shell-escaped so that paths containing spaces or
      # shell metacharacters are passed to the program as one argument.
      def compression_command program
        ensure_valid_compression_program program
        [
          IMW::EXTERNAL_PROGRAMS[program],
          COMPRESSION_FLAGS[program],
          Shellwords.escape(self.path.to_s)
        ].join ' '
      end

      # Return the path (a String) this file will have once it has
      # been compressed with +program+: the current directory and
      # basename with the program's extension appended.
      def compressed_file_path program
        ensure_valid_compression_program program
        File.join(self.dirname, self.basename + COMPRESSION_EXTS[program])
      end

      public

      # Compress this file in its present directory using +program+,
      # overwriting any existing compressed files and without saving
      # the original file.  Returns the result of <tt>IMW.open</tt> on
      # the compressed file's path.
      #
      # Options:
      #
      # <tt>program</tt> (<tt>:bzip2</tt>):: names the compression
      # program from the choices in <tt>COMPRESSION_PROGS</tt>.
      #
      # Raises <tt>IMW::PathError</tt> if this file does not exist and
      # <tt>IMW::Error</tt> if +program+ is not a known program.
      def compress! program = :bzip2
        raise IMW::PathError.new("cannot compress #{@path}, doesn't exist!") unless exist?
        FileUtils.cd(@dirname) { IMW.system(self.compression_command(program)) }
        IMW.open(self.compressed_file_path(program))
      end

      # Compress this file in its present directory, overwriting any
      # existing compressed files while keeping the original file.
      # Returns the result of <tt>IMW.open</tt> on the compressed
      # file's path.
      #
      # The original is preserved by copying it aside (to the path with
      # 'copy' appended), compressing, and then moving the copy back.
      #
      # Options:
      #
      # <tt>program</tt> (<tt>:bzip2</tt>):: names the compression
      # program from the choices in <tt>COMPRESSION_PROGS</tt>.
      #
      # Raises <tt>IMW::PathError</tt> if this file does not exist and
      # <tt>IMW::Error</tt> if +program+ is not a known program.
      def compress program = :bzip2
        raise IMW::PathError.new("cannot compress #{@path}, doesn't exist!") unless exist?
        # Fail before touching the filesystem when the program is unknown.
        ensure_valid_compression_program program

        backup = self.path + 'copy'
        # Copy outside the begin/ensure: if the copy itself fails there
        # is nothing to restore and the copy error must not be masked.
        FileUtils.cp(self.path, backup)

        compressed = nil
        begin
          compressed = compress! program
        ensure
          FileUtils.mv(backup, self.path)
        end
        compressed
      end

    end
  end
end
103
+
@@ -0,0 +1,112 @@
1
+ #
2
+ # h2. lib/imw/files/csv.rb -- CSV, TSV files
3
+ #
4
+ # == About
5
+ #
6
+ # For "comma-separated value" (CSV) and "tab-separated value" (TSV)
7
+ # files.
8
+ #
9
+ # Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
10
+ # Copyright:: Copyright (c) 2008 infochimps.org
11
+ # License:: GPL 3.0
12
+ # Website:: http://infinitemonkeywrench.org/
13
+ #
14
+ # puts "#{File.basename(__FILE__)}: Something clever" # at bottom
15
+
16
+ require 'fastercsv'
17
module IMW
  module Files

    # A base class from which to subclass various types of tabular
    # data files (CSV, TSV, &c.)
    class TabularDataFile < FasterCSV

      include IMW::Files::BasicFile
      include IMW::Files::Compressible

      # Default options to be passed to
      # FasterCSV[http://fastercsv.rubyforge.org/]; see its
      # documentation for more information.
      DEFAULT_OPTIONS = {
        :col_sep        => ',',
        :headers        => false,
        :return_headers => false,
        :write_headers  => true,
        :skip_blanks    => false,
        :force_quotes   => false
      }

      # Open +uri+ in +mode+ and hand the resulting handle to
      # FasterCSV together with +options+.  Any option not given
      # falls back to the class's <tt>DEFAULT_OPTIONS</tt>; the
      # caller's hash is left unmodified.
      def initialize uri, mode='r', options = {}
        options = options.reverse_merge(self.class::DEFAULT_OPTIONS)
        self.uri= uri
        super open(uri,mode),options
      end

      # Return the contents of this CSV file as an array of arrays.
      def load
        entries
      end

      # Dump +data+ to this file, one row per element of +data+.
      # Returns this object.
      #
      # Options include:
      # <tt>:flush</tt> (true):: flush the file buffer, writing it to disk
      # <tt>:close</tt> (true):: close the file after writing +data+
      def dump data, options = {}
        options = options.reverse_merge :close => true, :flush => true
        data.each {|row| self << row}
        self.flush if options[:flush]
        self.close if options[:close]
        self
      end

      # Return a random sample of at most +length+ rows.
      #
      # A first pass always takes the first row and then each later
      # row with probability 1/4.  If that yields fewer than +length+
      # rows, the file is rewound and a single second pass appends
      # rows that were not already taken, in file order, until
      # +length+ is reached or the file is exhausted.  If the file
      # has fewer than +length+ rows, all of them are returned.
      #
      # Rows collected before a <tt>FasterCSV::MalformedCSVError</tt>
      # are returned rather than raising.
      def sample length=10
        rows  = []
        taken = {} # indices of the rows picked during the first pass
        begin
          each_with_index do |row, index|
            break if rows.size >= length
            next if index != 0 && rand < 0.75 # skip 3/4 of rows after the 1st
            rows << row
            taken[index] = true
          end

          # now fill up to length if not there already
          if rows.size < length
            # The first pass consumed the stream; start again from the
            # top so indices line up with those recorded in +taken+.
            rewind
            each_with_index do |row, index|
              break if rows.size >= length
              next if taken.key?(index)
              rows << row
            end
          end
          rows
        rescue FasterCSV::MalformedCSVError
          rows
        end
      end
    end


    # Represents a file of comma-separated values (CSV).  This class
    # is a subclass of <tt>FasterCSV</tt> so the methods of that
    # library are available for use.
    #
    # See <tt>IMW::Files::TabularDataFile</tt> for more complete
    # documentation.
    class Csv < TabularDataFile
    end

    # Represents a file of tab-separated values (TSV).  This class
    # is a subclass of <tt>FasterCSV</tt> so the methods of that
    # library are available for use.
    #
    # See <tt>IMW::Files::TabularDataFile</tt> for more complete
    # documentation.
    class Tsv < TabularDataFile
      # Same as the parent's defaults except for a tab column separator.
      DEFAULT_OPTIONS = {:col_sep => "\t"}.reverse_merge DEFAULT_OPTIONS
    end

    FILE_REGEXPS << [/\.csv$/, IMW::Files::Csv]
    FILE_REGEXPS << [/\.tsv$/, IMW::Files::Tsv]

  end
end
@@ -0,0 +1,41 @@
1
+
2
+ # h2. lib/imw/files/json.rb -- describes json files
3
+ #
4
+ # == About
5
+ #
6
+ # A class for working with JSON files.
7
+ #
8
+ # Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
9
+ # Copyright:: Copyright (c) 2008 infochimps.org
10
+ # License:: GPL 3.0
11
+ # Website:: http://infinitemonkeywrench.org/
12
+ #
13
+ # puts "#{File.basename(__FILE__)}: Yet another clever comment." # at bottom
14
+
15
+ require 'json'
16
+ require 'imw/files/text'
17
+
18
module IMW
  module Files

    # A text file whose contents are JSON.
    class Json < IMW::Files::Text

      # Takes the same arguments as <tt>IMW::Files::Text#initialize</tt>
      # and passes them all through.
      def initialize uri, mode='r', options = {}
        super uri, mode, options
      end

      # Return the contents of this JSON file, parsed with
      # <tt>JSON.parse</tt>.  The file at <tt>@path</tt> is read in
      # one go and its handle is closed before parsing.
      #
      # FIXME what to do if a block is passed in?
      def load &block
        JSON.parse File.read(@path)
      end

      # Dump +data+ to this file as JSON.
      #
      # +options+ are handed unchanged to the parent class's +dump+
      # (e.g. <tt>:close</tt> to close the file handle afterwards).
      def dump data, options = {}
        super data.to_json, options
      end
    end
    FILE_REGEXPS << [/\.json$/, IMW::Files::Json]
  end
end
@@ -0,0 +1,65 @@
1
+ #
2
+ # h2. lib/imw/files/sgml.rb -- SGML files
3
+ #
4
+ # == About
5
+ #
6
+ # For SGML-derived files, including XML, HTML, &c..
7
+ #
8
+ # Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
9
+ # Copyright:: Copyright (c) 2008 infochimps.org
10
+ # License:: GPL 3.0
11
+ # Website:: http://infinitemonkeywrench.org/
12
+ #
13
+ # puts "#{File.basename(__FILE__)}: Something clever" # at bottom
14
+
15
+ require 'hpricot'
16
+ require 'imw/files/text'
17
+ require 'imw/parsers/html_parser'
18
+
19
module IMW
  module Files

    # Behaviour shared by SGML-derived file classes (XML, HTML).
    #
    # On construction the file is parsed once with Hpricot and the
    # result is kept in +doc+; unknown methods are delegated to it.
    # Including classes choose the parsing flavour by overriding
    # +parse_document+.
    module Sgml

      attr_accessor :doc

      # Initialize the underlying file via +super+, then parse +uri+
      # into <tt>@doc</tt> using +parse_document+.
      def initialize uri, mode='r', options={}
        super uri, mode, options
        @doc = parse_document(uri)
      end

      # Delegate to Hpricot
      def method_missing method, *args, &block
        @doc.send method, *args, &block
      end

      # Parse this file using the IMW HTMLParser.  The parser can
      # either be passed in directly or constructed from a passed hash
      # of matchers.
      def parse *args
        parser = args.first.is_a?(IMW::HTMLParser) ? args.first : IMW::HTMLParser.new(*args)
        parser.parse(self)
      end

      protected

      # Open +uri+, parse it with Hpricot's default (HTML) parser and
      # return the resulting document.  The block form of +open+ is
      # used so the handle is closed as soon as parsing is done.
      def parse_document uri
        open(uri) { |io| Hpricot(io) }
      end

    end

    # An XML file; its +doc+ is built with <tt>Hpricot.XML</tt>.
    class Xml < IMW::Files::Text
      include Sgml

      protected

      # Parse +uri+ with Hpricot's XML parser instead of the default
      # one, so the document is only read and parsed once.
      def parse_document uri
        open(uri) { |io| Hpricot.XML(io) }
      end
    end

    # An HTML file; its +doc+ is built with Hpricot's default parser.
    class Html < IMW::Files::Text
      include Sgml
    end
  end
end
63
+
64
+
65
+
@@ -0,0 +1,68 @@
1
module IMW
  module Files

    # Used to process text files when no more specialized class is suitable.
    #
    #   f = IMW::Files::Text.new '/path/to/my_file.dat'
    #   f.load do |line|
    #     # ...
    #   end
    #
    # Missing methods will be passed to the associated file handle
    # (either IO or StringIO depending on whether the URI passed in
    # was local or remote) so the usual stuff like read or each_line
    # still works, including when a block is given.
    class Text

      include IMW::Files::BasicFile
      include IMW::Files::Compressible

      attr_reader :file, :parser

      # Record +uri+ and open it in +mode+, keeping the handle in
      # +file+.  +options+ is accepted for signature compatibility
      # with other file classes and is not used here.
      #
      # Raises <tt>IMW::PathError</tt> when asked to open a remote
      # URI in mode 'w'.
      def initialize uri, mode='r', options = {}
        self.uri= uri
        raise IMW::PathError.new("Cannot write to remote file #{uri}") if mode == 'w' && remote?
        @file = open(uri, mode)
      end

      # Return the contents of this text file as a string.
      def load
        file.read
      end

      # Return an array with each line of this file.  If given a
      # block, pass each line to the block.  Lines are chomped in
      # both cases.
      def entries &block
        if block_given?
          file.each do |line|
            yield line.chomp
          end
        else
          file.map do |line|
            line.chomp
          end
        end
      end

      # Dump +data+ to this file as a string.  Close the file handle
      # if passed in :close.
      #
      # A String is written verbatim, so text produced by subclasses
      # (JSON, YAML) is not wrapped in quotes and escaped; any other
      # object is written using its +inspect+ representation.
      def dump data, options={}
        file.write(data.is_a?(String) ? data : data.inspect)
        file.close if options[:close]
      end

      # Forward unknown methods, together with any block, to the
      # underlying file handle.
      def method_missing method, *args, &block
        file.send method, *args, &block
      end

      # Parse this file with an <tt>IMW::Parsers::RegexpParser</tt>
      # built from +parser_spec+.  The <tt>:lines</tt> entry of
      # +parser_spec+, if any, is not given to the parser's
      # constructor but passed to <tt>parse!</tt> instead.  The
      # caller's hash is not modified.
      def parse parser_spec, &block
        spec    = parser_spec.dup
        lines   = spec.delete(:lines)
        @parser = IMW::Parsers::RegexpParser.new(spec)
        parser.parse!(file, {:lines => lines}, &block)
      end

    end
  end
end
67
+
68
+ # puts "#{File.basename(__FILE__)}: Don't forget to put a nametag on your Monkeywrench or one of the other chimps might steal it!" # at bottom
@@ -0,0 +1,46 @@
1
+ #
2
+ # h2. lib/imw/files/yaml.rb -- describes yaml files
3
+ #
4
+ # == About
5
+ #
6
+ # A class for working with YAML files.
7
+ #
8
+ # Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
9
+ # Copyright:: Copyright (c) 2008 infochimps.org
10
+ # License:: GPL 3.0
11
+ # Website:: http://infinitemonkeywrench.org/
12
+ #
13
+
14
+ require 'yaml'
15
+ require 'imw/files/text'
16
+
17
module IMW
  module Files

    # A text file whose contents are YAML.
    class Yaml < IMW::Files::Text

      # Takes the same arguments as <tt>IMW::Files::Text#initialize</tt>
      # and passes them all through.
      def initialize uri, mode='r', options = {}
        super uri, mode, options
      end

      # Return the contents of this YAML file, as loaded by
      # <tt>YAML.load_file</tt> from <tt>@path</tt>.
      #
      # NOTE(review): depending on the Ruby/YAML version,
      # <tt>YAML.load_file</tt> may instantiate arbitrary objects
      # named in the document -- only use on trusted files.
      #
      # FIXME what to do if a block is passed in?
      def load &block
        YAML.load_file @path
      end

      # Dump +data+ to this file as YAML.
      #
      # +options+ are handed unchanged to the parent class's +dump+
      # (e.g. <tt>:close</tt> to close the file handle afterwards).
      def dump data, options = {}
        super data.to_yaml, options
      end

    end

    FILE_REGEXPS << [/\.yaml$/, IMW::Files::Yaml]
    FILE_REGEXPS << [/\.yml$/,  IMW::Files::Yaml]

  end
end
45
+
46
+ # puts "#{File.basename(__FILE__)}: Yet another clever comment." # at bottom
@@ -0,0 +1,8 @@
1
module IMW
  # Namespace for the packaging classes.  Each constant is registered
  # for autoloading, so its file is only required on first reference.
  module Packagers
    autoload(:Archiver, 'imw/packagers/archiver')
    autoload(:S3Mover,  'imw/packagers/s3_mover')
  end
end
7
+
8
+