imw 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (111) hide show
  1. data/.gitignore +15 -0
  2. data/CHANGELOG +0 -0
  3. data/LICENSE +674 -0
  4. data/README.rdoc +101 -0
  5. data/Rakefile +20 -0
  6. data/VERSION +1 -0
  7. data/etc/imwrc.rb +76 -0
  8. data/lib/imw.rb +42 -0
  9. data/lib/imw/boot.rb +58 -0
  10. data/lib/imw/dataset.rb +233 -0
  11. data/lib/imw/dataset/datamapper.rb +66 -0
  12. data/lib/imw/dataset/datamapper/time_and_user_stamps.rb +37 -0
  13. data/lib/imw/dataset/loaddump.rb +50 -0
  14. data/lib/imw/dataset/old/file_collection.rb +88 -0
  15. data/lib/imw/dataset/old/file_collection_utils.rb +71 -0
  16. data/lib/imw/dataset/scaffold.rb +132 -0
  17. data/lib/imw/dataset/scraped_uri.rb +305 -0
  18. data/lib/imw/dataset/scrub/old_working_scrubber.rb +87 -0
  19. data/lib/imw/dataset/scrub/scrub.rb +147 -0
  20. data/lib/imw/dataset/scrub/scrub_simple_url.rb +38 -0
  21. data/lib/imw/dataset/scrub/scrub_test.rb +60 -0
  22. data/lib/imw/dataset/scrub/slug.rb +101 -0
  23. data/lib/imw/dataset/stats.rb +73 -0
  24. data/lib/imw/dataset/stats/counter.rb +23 -0
  25. data/lib/imw/dataset/task.rb +38 -0
  26. data/lib/imw/dataset/workflow.rb +81 -0
  27. data/lib/imw/files.rb +110 -0
  28. data/lib/imw/files/archive.rb +113 -0
  29. data/lib/imw/files/basicfile.rb +122 -0
  30. data/lib/imw/files/binary.rb +28 -0
  31. data/lib/imw/files/compressed_file.rb +93 -0
  32. data/lib/imw/files/compressed_files_and_archives.rb +348 -0
  33. data/lib/imw/files/compressible.rb +103 -0
  34. data/lib/imw/files/csv.rb +112 -0
  35. data/lib/imw/files/json.rb +41 -0
  36. data/lib/imw/files/sgml.rb +65 -0
  37. data/lib/imw/files/text.rb +68 -0
  38. data/lib/imw/files/yaml.rb +46 -0
  39. data/lib/imw/packagers.rb +8 -0
  40. data/lib/imw/packagers/archiver.rb +108 -0
  41. data/lib/imw/packagers/s3_mover.rb +28 -0
  42. data/lib/imw/parsers.rb +7 -0
  43. data/lib/imw/parsers/html_parser.rb +382 -0
  44. data/lib/imw/parsers/html_parser/matchers.rb +306 -0
  45. data/lib/imw/parsers/line_parser.rb +87 -0
  46. data/lib/imw/parsers/regexp_parser.rb +72 -0
  47. data/lib/imw/utils.rb +24 -0
  48. data/lib/imw/utils/components.rb +61 -0
  49. data/lib/imw/utils/config.rb +46 -0
  50. data/lib/imw/utils/error.rb +54 -0
  51. data/lib/imw/utils/extensions/array.rb +125 -0
  52. data/lib/imw/utils/extensions/class/attribute_accessors.rb +8 -0
  53. data/lib/imw/utils/extensions/core.rb +43 -0
  54. data/lib/imw/utils/extensions/dir.rb +24 -0
  55. data/lib/imw/utils/extensions/file_core.rb +64 -0
  56. data/lib/imw/utils/extensions/hash.rb +218 -0
  57. data/lib/imw/utils/extensions/hpricot.rb +48 -0
  58. data/lib/imw/utils/extensions/string.rb +49 -0
  59. data/lib/imw/utils/extensions/struct.rb +42 -0
  60. data/lib/imw/utils/extensions/symbol.rb +28 -0
  61. data/lib/imw/utils/extensions/typed_struct.rb +22 -0
  62. data/lib/imw/utils/extensions/uri.rb +59 -0
  63. data/lib/imw/utils/log.rb +67 -0
  64. data/lib/imw/utils/misc.rb +63 -0
  65. data/lib/imw/utils/paths.rb +115 -0
  66. data/lib/imw/utils/uri.rb +59 -0
  67. data/lib/imw/utils/uuid.rb +33 -0
  68. data/lib/imw/utils/validate.rb +38 -0
  69. data/lib/imw/utils/version.rb +12 -0
  70. data/lib/imw/utils/view.rb +113 -0
  71. data/lib/imw/utils/view/dump_csv.rb +112 -0
  72. data/lib/imw/utils/view/dump_csv_older.rb +117 -0
  73. data/spec/data/sample.csv +131 -0
  74. data/spec/data/sample.tsv +131 -0
  75. data/spec/data/sample.txt +131 -0
  76. data/spec/data/sample.xml +653 -0
  77. data/spec/data/sample.yaml +652 -0
  78. data/spec/imw/dataset/datamapper/uri_spec.rb +43 -0
  79. data/spec/imw/dataset/datamapper_spec_helper.rb +11 -0
  80. data/spec/imw/files/archive_spec.rb +118 -0
  81. data/spec/imw/files/basicfile_spec.rb +121 -0
  82. data/spec/imw/files/bz2_spec.rb +32 -0
  83. data/spec/imw/files/compressed_file_spec.rb +96 -0
  84. data/spec/imw/files/compressible_spec.rb +100 -0
  85. data/spec/imw/files/file_spec.rb +144 -0
  86. data/spec/imw/files/gz_spec.rb +32 -0
  87. data/spec/imw/files/rar_spec.rb +33 -0
  88. data/spec/imw/files/tar_spec.rb +31 -0
  89. data/spec/imw/files/text_spec.rb +23 -0
  90. data/spec/imw/files/zip_spec.rb +31 -0
  91. data/spec/imw/files_spec.rb +38 -0
  92. data/spec/imw/packagers/archiver_spec.rb +125 -0
  93. data/spec/imw/packagers/s3_mover_spec.rb +7 -0
  94. data/spec/imw/parsers/line_parser_spec.rb +96 -0
  95. data/spec/imw/parsers/regexp_parser_spec.rb +42 -0
  96. data/spec/imw/utils/extensions/file_core_spec.rb +72 -0
  97. data/spec/imw/utils/extensions/find_spec.rb +113 -0
  98. data/spec/imw/utils/paths_spec.rb +38 -0
  99. data/spec/imw/workflow/rip/local_spec.rb +89 -0
  100. data/spec/imw/workflow/rip_spec.rb +27 -0
  101. data/spec/rcov.opts +1 -0
  102. data/spec/spec.opts +4 -0
  103. data/spec/spec_helper.rb +32 -0
  104. data/spec/support/archive_contents_matcher.rb +94 -0
  105. data/spec/support/custom_matchers.rb +21 -0
  106. data/spec/support/directory_contents_matcher.rb +61 -0
  107. data/spec/support/extensions.rb +18 -0
  108. data/spec/support/file_contents_matcher.rb +50 -0
  109. data/spec/support/random.rb +210 -0
  110. data/spec/support/without_regard_to_order_matcher.rb +58 -0
  111. metadata +196 -0
@@ -0,0 +1,103 @@
1
+ #
2
+ # h2. lib/imw/files/compressible.rb -- compression module
3
+ #
4
+ # == About
5
+ #
6
+ # Module used for compression of files. An including
7
+ # <tt>IMW::Files::BasicFile</tt> object gains +compress+ and
8
+ # <tt>compress!</tt> methods.
9
+ #
10
+ # By default, bzip2 is used for compression though gzip can also be
11
+ # specified (the full list of known compression programs is in
12
+ # <tt>IMW::Files::Compressible::COMPRESSION_PROGS</tt>). Zip and Rar
13
+ # compression are handled by the <tt>IMW::Files::Archive</tt> module.
14
+ #
15
+ # Decompression should be handled via the
16
+ # <tt>IMW::Files::CompressedFile</tt> class.
17
+ #
18
+ # Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
19
+ # Copyright:: Copyright (c) 2008 infochimps.org
20
+ # License:: GPL 3.0
21
+ # Website:: http://infinitemonkeywrench.org/
22
+ #
23
+ # puts "#{File.basename(__FILE__)}: Why is it that when you squeeze a lemon you get lemonade but when you squeeze a banana you just get a mess?" # at bottom
24
require 'shellwords'

module IMW
  module Files
    # Mixin that gives a file object +compress+ and <tt>compress!</tt>
    # methods.
    #
    # The methods below call +path+, +dirname+, +basename+ and
    # <tt>exist?</tt> on the including object and read its <tt>@path</tt>
    # and <tt>@dirname</tt> instance variables, so the including class
    # must provide them.
    module Compressible

      # Known compression programs.
      COMPRESSION_PROGS = [:bzip2, :gzip]

      # Extensions that are appended by each compression program.
      COMPRESSION_EXTS = {
        :bzip2 => '.bz2',
        :gzip  => '.gz'
      }

      # Compression flags for each program.
      COMPRESSION_FLAGS = {
        :bzip2 => "-f",
        :gzip  => "-f"
      }

      protected

      # Check that +program+ is a valid compression program.
      #
      # Raises <tt>IMW::Error</tt> (listing the accepted programs) when
      # +program+ is not in +COMPRESSION_PROGS+.
      def ensure_valid_compression_program program
        return if COMPRESSION_PROGS.include? program
        raise IMW::Error.new("#{program} is not a valid compression program (#{COMPRESSION_PROGS.join(', ')}).")
      end

      # Construct the command passed to the shell to compress this
      # file using the given +program+.
      #
      # The path is shell-escaped so that file names containing spaces
      # or shell metacharacters are passed through as a single argument.
      def compression_command program
        ensure_valid_compression_program program
        [
          IMW::EXTERNAL_PROGRAMS[program],
          COMPRESSION_FLAGS[program],
          Shellwords.escape(self.path)
        ].join ' '
      end

      # Return the path this file will have once it has been compressed
      # with +program+ (same directory, basename plus the program's
      # extension).
      def compressed_file_path program
        ensure_valid_compression_program program
        File.join(self.dirname, self.basename + COMPRESSION_EXTS[program])
      end

      public

      # Compress this file in its present directory using +program+,
      # overwriting any existing compressed files and without saving
      # the original file.  Returns the result of <tt>IMW.open</tt> on
      # the compressed path (documented as an
      # <tt>IMW::Files::CompressedFile</tt>).
      #
      # +program+ (default <tt>:bzip2</tt>) is a positional argument
      # naming one of +COMPRESSION_PROGS+.
      #
      # Raises <tt>IMW::PathError</tt> if the file does not exist and
      # <tt>IMW::Error</tt> if +program+ is unknown.
      def compress! program = :bzip2
        raise IMW::PathError.new("cannot compress #{@path}, doesn't exist!") unless exist?
        FileUtils.cd(@dirname) { IMW.system(self.compression_command(program)) }
        IMW.open(self.compressed_file_path(program))
      end

      # Compress this file in its present directory, overwriting any
      # existing compressed files while keeping the original file.
      # Returns the result of <tt>IMW.open</tt> on the compressed path
      # (documented as an <tt>IMW::Files::CompressedFile</tt>).
      #
      # +program+ (default <tt>:bzip2</tt>) is a positional argument
      # naming one of +COMPRESSION_PROGS+.
      #
      # Raises <tt>IMW::PathError</tt> if the file does not exist and
      # <tt>IMW::Error</tt> if +program+ is unknown.
      def compress program = :bzip2
        raise IMW::PathError.new("cannot compress #{@path}, doesn't exist!") unless exist?
        # Validate before touching the disk so a bad program name does
        # not leave a stray backup copy behind.
        ensure_valid_compression_program program

        backup = self.path + 'copy'
        # The copy happens outside the begin/ensure: if it fails there is
        # nothing to restore, and the copy error must not be masked.
        FileUtils.cp(self.path, backup)
        begin
          compress! program
        ensure
          # Put the original back whether or not compression succeeded.
          FileUtils.mv(backup, self.path)
        end
        IMW.open(self.compressed_file_path(program))
      end

    end
  end
end
103
+
@@ -0,0 +1,112 @@
1
+ #
2
+ # h2. lib/imw/files/csv.rb -- CSV, TSV files
3
+ #
4
+ # == About
5
+ #
6
+ # For "comma-separated value" (CSV) and "tab-separated value" (TSV)
7
+ # files.
8
+ #
9
+ # Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
10
+ # Copyright:: Copyright (c) 2008 infochimps.org
11
+ # License:: GPL 3.0
12
+ # Website:: http://infinitemonkeywrench.org/
13
+ #
14
+ # puts "#{File.basename(__FILE__)}: Something clever" # at bottom
15
+
16
+ require 'fastercsv'
17
module IMW
  module Files

    # A base class from which to subclass various types of tabular
    # data files (CSV, TSV, &c.)
    class TabularDataFile < FasterCSV

      include IMW::Files::BasicFile
      include IMW::Files::Compressible

      # Default options to be passed to
      # FasterCSV[http://fastercsv.rubyforge.org/]; see its
      # documentation for more information.
      DEFAULT_OPTIONS = {
        :col_sep        => ',',
        :headers        => false,
        :return_headers => false,
        :write_headers  => true,
        :skip_blanks    => false,
        :force_quotes   => false
      }

      # Open +uri+ in +mode+ and hand the resulting IO to FasterCSV.
      #
      # +options+ are layered over the class's +DEFAULT_OPTIONS+ (so
      # subclasses can supply their own defaults).  The caller's hash is
      # not modified.
      def initialize uri, mode='r', options = {}
        options = self.class::DEFAULT_OPTIONS.merge(options)
        self.uri = uri
        super open(uri, mode), options
      end

      # Return the contents of this CSV file as an array of arrays.
      def load
        entries
      end

      # Dump +data+ to this file, one row per element of +data+.
      # Returns +self+.
      #
      # Options include:
      # <tt>:flush</tt> (true):: flush the file buffer, writing it to disk
      # <tt>:close</tt> (true):: close the file after writing +data+
      def dump data, options = {}
        options = { :close => true, :flush => true }.merge(options)
        data.each { |row| self << row }
        self.flush if options[:flush]
        self.close if options[:close]
        self
      end

      # Return a random sample of at most +length+ rows.
      #
      # The first pass always takes the first row and then keeps each
      # later row with probability 1/4.  If that leaves fewer than
      # +length+ rows, the file is rewound and the rows skipped by the
      # first pass are appended in file order until +length+ is reached
      # or the file is exhausted.  A file with fewer than +length+ rows
      # therefore yields all of its rows instead of looping forever.
      #
      # If FasterCSV reports malformed CSV, the rows gathered so far are
      # returned.
      def sample length=10
        rows  = []
        taken = {}  # index => true for rows picked by the first pass
        begin
          each_with_index do |row, index|
            break if rows.size >= length
            next  if index != 0 && rand < 0.75 # skip 3/4 of rows after the 1st
            rows << row
            taken[index] = true
          end

          # now fill up to length if not there already
          if rows.size < length
            rewind  # the first pass may have consumed the whole file
            each_with_index do |row, index|
              break if rows.size >= length
              next  if taken[index]
              rows << row
            end
          end
          rows
        rescue FasterCSV::MalformedCSVError
          rows
        end
      end
    end


    # Represents a file of comma-separated values (CSV).  This class
    # is a subclass of <tt>FasterCSV</tt> so the methods of that
    # library are available for use.
    #
    # See <tt>IMW::Files::TabularDataFile</tt> for more complete
    # documentation.
    class Csv < TabularDataFile
    end

    # Represents a file of tab-separated values (TSV).  This class
    # is a subclass of <tt>FasterCSV</tt> so the methods of that
    # library are available for use.
    #
    # See <tt>IMW::Files::TabularDataFile</tt> for more complete
    # documentation.
    class Tsv < TabularDataFile
      # Same defaults as TabularDataFile, but tab-separated.
      DEFAULT_OPTIONS = TabularDataFile::DEFAULT_OPTIONS.merge(:col_sep => "\t")
    end

    # Register the extensions handled by the classes above.
    FILE_REGEXPS << [/\.csv$/, IMW::Files::Csv]
    FILE_REGEXPS << [/\.tsv$/, IMW::Files::Tsv]

  end
end
@@ -0,0 +1,41 @@
1
+
2
+ # h2. lib/imw/files/json.rb -- describes json files
3
+ #
4
+ # == About
5
+ #
6
+ # A class for working with JSON files.
7
+ #
8
+ # Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
9
+ # Copyright:: Copyright (c) 2008 infochimps.org
10
+ # License:: GPL 3.0
11
+ # Website:: http://infinitemonkeywrench.org/
12
+ #
13
+ # puts "#{File.basename(__FILE__)}: Yet another clever comment." # at bottom
14
+
15
+ require 'json'
16
+ require 'imw/files/text'
17
+
18
module IMW
  module Files

    # A text file whose contents are JSON.
    class Json < IMW::Files::Text

      # Open +uri+ in +mode+; all arguments are handed to the
      # superclass initializer.
      def initialize uri, mode='r', options = {}
        super uri, mode, options
      end

      # Return the contents of this JSON file, parsed with
      # <tt>JSON.parse</tt>.
      #
      # The file at <tt>@path</tt> is read in one call so that no file
      # handle is left open.
      #
      # FIXME what to do if a block is passed in?  (It is currently
      # ignored.)
      def load &block
        JSON.parse File.read(@path)
      end

      # Dump +data+ to this file as JSON.
      #
      # +data+ is serialized with +to_json+ and passed, together with
      # +options+, to the superclass +dump+.
      def dump data, options = {}
        super data.to_json, options
      end
    end

    # Register the extension handled by the class above.
    FILE_REGEXPS << [/\.json$/, IMW::Files::Json]
  end
end
@@ -0,0 +1,65 @@
1
+ #
2
+ # h2. lib/imw/files/sgml.rb -- SGML files
3
+ #
4
+ # == About
5
+ #
6
+ # For SGML-derived files, including XML, HTML, &c.
7
+ #
8
+ # Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
9
+ # Copyright:: Copyright (c) 2008 infochimps.org
10
+ # License:: GPL 3.0
11
+ # Website:: http://infinitemonkeywrench.org/
12
+ #
13
+ # puts "#{File.basename(__FILE__)}: Something clever" # at bottom
14
+
15
+ require 'hpricot'
16
+ require 'imw/files/text'
17
+ require 'imw/parsers/html_parser'
18
+
19
module IMW
  module Files

    # Behaviour shared by SGML-derived file classes.  An including
    # class gains a +doc+ attribute holding the Hpricot document parsed
    # from the file, and methods it does not define itself are
    # forwarded to that document.
    module Sgml

      attr_accessor :doc

      # Run the including class's own initializer, then parse the
      # document exactly once via +parse_document+.
      def initialize uri, mode='r', options={}
        super uri, mode, options
        @doc = parse_document(uri)
      end

      # Delegate to Hpricot.
      #
      # Until <tt>@doc</tt> has been set the call is passed up the
      # ancestor chain instead of being sent to +nil+.
      def method_missing method, *args, &block
        return super if @doc.nil?
        @doc.send method, *args, &block
      end

      # Keep <tt>respond_to?</tt> consistent with +method_missing+.
      def respond_to_missing? method, include_private = false
        (!@doc.nil? && @doc.respond_to?(method, include_private)) || super
      end

      # Parse this file using the IMW HTMLParser.  The parser can
      # either be passed in directly or constructed from a passed hash
      # of matchers.
      def parse *args
        parser = args.first.is_a?(IMW::HTMLParser) ? args.first : IMW::HTMLParser.new(*args)
        parser.parse(self)
      end

      protected

      # Build the Hpricot document for +uri+.  Classes that need a
      # different Hpricot entry point override this.  The block form of
      # +open+ is used so the handle is closed once parsing is done.
      def parse_document uri
        open(uri) { |io| Hpricot(io) }
      end

    end

    # An XML file, parsed with <tt>Hpricot.XML</tt>.
    class Xml < IMW::Files::Text
      include Sgml

      protected

      # Use Hpricot's XML parser instead of the default one.
      def parse_document uri
        open(uri) { |io| Hpricot.XML(io) }
      end
    end

    # An HTML file, parsed with the default Hpricot parser.
    class Html < IMW::Files::Text
      include Sgml
    end
  end
end
63
+
64
+
65
+
@@ -0,0 +1,68 @@
1
module IMW
  module Files

    # Used to process text files when no more specialized class is suitable.
    #
    #   f = IMW::Files::Text.new '/path/to/my_file.dat'
    #   f.load do |line|
    #     # ...
    #   end
    #
    # Missing methods will be passed to the associated file handle
    # (either IO or StringIO depending on whether the URI passed in
    # was local or remote) so the usual stuff like read or each_line
    # still works.
    class Text

      include IMW::Files::BasicFile
      include IMW::Files::Compressible

      attr_reader :file, :parser

      # Open +uri+ in +mode+ and keep the handle in +file+.
      #
      # Raises <tt>IMW::PathError</tt> when a writing mode (anything
      # containing +w+, +a+ or <tt>+</tt>) is requested for a remote
      # URI.  +options+ is accepted for interface compatibility with
      # subclasses and is not used here.
      def initialize uri, mode='r', options = {}
        self.uri = uri
        raise IMW::PathError.new("Cannot write to remote file #{uri}") if mode.to_s =~ /[wa+]/ && remote?
        @file = open(uri, mode)
      end

      # Return the contents of this text file as a string.
      def load
        file.read
      end

      # Return an array with each line of this file.  If given a
      # block, pass each line to the block.  Trailing newlines are
      # removed in both cases.
      def entries &block
        if block_given?
          file.each do |line|
            yield line.chomp
          end
        else
          file.map do |line|
            line.chomp
          end
        end
      end

      # Dump +data+ to this file as a string.  Close the file handle
      # if passed in :close.
      #
      # A String is written verbatim, so already-serialized text is not
      # wrapped in quotes and escapes; any other object is written using
      # its +inspect+ form.
      def dump data, options={}
        file.write(data.is_a?(String) ? data : data.inspect)
        file.close if options[:close]
      end

      # Forward unknown methods, including any block, to the file
      # handle.
      def method_missing method, *args, &block
        file.send method, *args, &block
      end

      # Keep <tt>respond_to?</tt> consistent with +method_missing+.
      def respond_to_missing? method, include_private = false
        (!@file.nil? && @file.respond_to?(method, include_private)) || super
      end

      # Parse this file with an <tt>IMW::Parsers::RegexpParser</tt>
      # built from +parser_spec+.  The <tt>:lines</tt> entry of the spec
      # is passed to <tt>parse!</tt> rather than to the parser's
      # constructor.  The caller's hash is left untouched.
      def parse parser_spec, &block
        spec    = parser_spec.dup
        lines   = spec.delete(:lines)
        @parser = IMW::Parsers::RegexpParser.new(spec)
        parser.parse!(file, {:lines => lines}, &block)
      end

    end
  end
end
67
+
68
+ # puts "#{File.basename(__FILE__)}: Don't forget to put a nametag on your Monkeywrench or one of the other chimps might steal it!" # at bottom
@@ -0,0 +1,46 @@
1
+ #
2
+ # h2. lib/imw/files/yaml.rb -- describes yaml files
3
+ #
4
+ # == About
5
+ #
6
+ # A class for working with YAML files.
7
+ #
8
+ # Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
9
+ # Copyright:: Copyright (c) 2008 infochimps.org
10
+ # License:: GPL 3.0
11
+ # Website:: http://infinitemonkeywrench.org/
12
+ #
13
+
14
+ require 'yaml'
15
+ require 'imw/files/text'
16
+
17
module IMW
  module Files

    # A text file whose contents are YAML.
    class Yaml < IMW::Files::Text

      # Open +uri+ in +mode+; all arguments are handed to the
      # superclass initializer.
      def initialize uri, mode='r', options = {}
        super uri, mode, options
      end

      # Return the contents of this YAML file, as parsed by
      # <tt>YAML.load_file</tt> from <tt>@path</tt>.
      #
      # NOTE(review): <tt>YAML.load_file</tt> can instantiate arbitrary
      # objects; if this class is ever pointed at untrusted files a
      # safe loader should be used instead.
      #
      # FIXME what to do if a block is passed in?  (It is currently
      # ignored.)
      def load &block
        YAML.load_file @path
      end

      # Dump +data+ to this file as YAML.
      #
      # +data+ is serialized with +to_yaml+ and passed, together with
      # +options+, to the superclass +dump+.
      def dump data, options = {}
        super data.to_yaml, options
      end

    end

    # Register the extensions handled by the class above.
    FILE_REGEXPS << [/\.yaml$/, IMW::Files::Yaml]
    FILE_REGEXPS << [/\.yml$/,  IMW::Files::Yaml]

  end
end
45
+
46
+ # puts "#{File.basename(__FILE__)}: Yet another clever comment." # at bottom
@@ -0,0 +1,8 @@
1
module IMW
  # Namespace for the packager classes.  Nothing is loaded eagerly:
  # each constant below is registered with +autoload+ and its file is
  # required the first time the constant is referenced.
  module Packagers
    # Constant name => feature path that defines it.
    {
      :Archiver => 'imw/packagers/archiver',
      :S3Mover  => 'imw/packagers/s3_mover'
    }.each do |const_name, feature|
      autoload const_name, feature
    end
  end
end
7
+
8
+