iostreams 0.14.0 → 0.15.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (62) hide show
  1. checksums.yaml +4 -4
  2. data/LICENSE +202 -0
  3. data/README.md +155 -47
  4. data/lib/io_streams/file/reader.rb +7 -8
  5. data/lib/io_streams/file/writer.rb +7 -8
  6. data/lib/io_streams/io_streams.rb +313 -129
  7. data/lib/io_streams/{delimited → line}/reader.rb +20 -30
  8. data/lib/io_streams/line/writer.rb +81 -0
  9. data/lib/io_streams/pgp.rb +4 -14
  10. data/lib/io_streams/record/reader.rb +55 -0
  11. data/lib/io_streams/record/writer.rb +63 -0
  12. data/lib/io_streams/row/reader.rb +60 -0
  13. data/lib/io_streams/row/writer.rb +62 -0
  14. data/lib/io_streams/s3.rb +25 -0
  15. data/lib/io_streams/s3/reader.rb +64 -0
  16. data/lib/io_streams/s3/writer.rb +13 -0
  17. data/lib/io_streams/streams.rb +1 -1
  18. data/lib/io_streams/tabular.rb +163 -0
  19. data/lib/io_streams/tabular/errors.rb +14 -0
  20. data/lib/io_streams/tabular/header.rb +146 -0
  21. data/lib/io_streams/tabular/parser/array.rb +26 -0
  22. data/lib/io_streams/tabular/parser/base.rb +12 -0
  23. data/lib/io_streams/tabular/parser/csv.rb +35 -0
  24. data/lib/io_streams/tabular/parser/fixed.rb +88 -0
  25. data/lib/io_streams/tabular/parser/hash.rb +21 -0
  26. data/lib/io_streams/tabular/parser/json.rb +25 -0
  27. data/lib/io_streams/tabular/parser/psv.rb +34 -0
  28. data/lib/io_streams/tabular/utility/csv_row.rb +115 -0
  29. data/lib/io_streams/version.rb +2 -2
  30. data/lib/io_streams/xlsx/reader.rb +1 -1
  31. data/lib/io_streams/zip/reader.rb +1 -1
  32. data/lib/io_streams/zip/writer.rb +1 -1
  33. data/lib/iostreams.rb +21 -10
  34. data/test/bzip2_reader_test.rb +21 -22
  35. data/test/bzip2_writer_test.rb +38 -32
  36. data/test/file_reader_test.rb +19 -18
  37. data/test/file_writer_test.rb +23 -22
  38. data/test/files/test.json +3 -0
  39. data/test/gzip_reader_test.rb +21 -22
  40. data/test/gzip_writer_test.rb +35 -29
  41. data/test/io_streams_test.rb +137 -61
  42. data/test/line_reader_test.rb +105 -0
  43. data/test/line_writer_test.rb +50 -0
  44. data/test/pgp_reader_test.rb +29 -29
  45. data/test/pgp_test.rb +149 -195
  46. data/test/pgp_writer_test.rb +63 -62
  47. data/test/record_reader_test.rb +61 -0
  48. data/test/record_writer_test.rb +73 -0
  49. data/test/row_reader_test.rb +34 -0
  50. data/test/row_writer_test.rb +51 -0
  51. data/test/tabular_test.rb +184 -0
  52. data/test/xlsx_reader_test.rb +13 -17
  53. data/test/zip_reader_test.rb +21 -22
  54. data/test/zip_writer_test.rb +40 -36
  55. metadata +41 -17
  56. data/lib/io_streams/csv/reader.rb +0 -21
  57. data/lib/io_streams/csv/writer.rb +0 -20
  58. data/lib/io_streams/delimited/writer.rb +0 -67
  59. data/test/csv_reader_test.rb +0 -34
  60. data/test/csv_writer_test.rb +0 -35
  61. data/test/delimited_reader_test.rb +0 -115
  62. data/test/delimited_writer_test.rb +0 -44
@@ -0,0 +1,25 @@
1
+ begin
2
+ require 'aws-sdk-s3'
3
+ rescue LoadError => exc
4
+ raise(LoadError, "Install gem 'aws-sdk-s3' to read and write AWS S3 files: #{exc.message}")
5
+ end
6
+
7
require 'uri'
module IOStreams
  module S3
    # Returns [Hash] with keys :bucket and :key extracted from the supplied S3 path.
    #
    # Accepts either a full URI:
    #   parse_uri('s3://mybucket/user/abc.zip') # => {bucket: 'mybucket', key: 'user/abc.zip'}
    # or a plain bucket/key path:
    #   parse_uri('mybucket/user/abc.zip')      # => {bucket: 'mybucket', key: 'user/abc.zip'}
    #
    # Raises RuntimeError when the supplied path does not contain both a bucket and a key.
    def self.parse_uri(uri)
      uri = URI.parse(uri)
      if uri.scheme.nil?
        # Bucket and key only, e.g. 'mybucket/user/abc.zip'
        segments = uri.path.split('/')
        raise "S3 URI must at the very least contain '<bucket_name>/<key>'" if (segments.size == 1) || (segments[0] == '')
        {
          bucket: segments.shift,
          key:    segments.join('/')
        }
      else
        # Full URI, e.g. 's3://mybucket/user/abc.zip'.
        # Fix: the original only handled the scheme-less form and silently
        # returned nil for 's3://...' URIs (its own documented sample input).
        raise "S3 URI must at the very least contain 's3://<bucket_name>/<key>'" if uri.host.nil? || uri.path.length <= 1
        {
          bucket: uri.host,
          key:    uri.path.sub(%r{\A/}, '')
        }
      end
    end
  end
end
@@ -0,0 +1,64 @@
1
module IOStreams
  module S3
    class Reader
      # Read from an AWS S3 object.
      #
      # Parameters:
      #   uri: [String]
      #     Optional URI of the form 's3://bucket/key'.
      #     When omitted, bucket: and key: must be supplied.
      #   bucket: [String]  S3 bucket name.
      #   region: [String]  Optional AWS region; defaults to the SDK's configured region.
      #   key:    [String]  S3 object key.
      #
      # The block is called with an IO from which the object's data can be read.
      def self.open(uri = nil, bucket: nil, region: nil, key: nil, &block)
        # Explicit keyword arguments override values extracted from the URI.
        # Fix: the original referenced an undefined local `args`, and called
        # the module-level `parse_uri` without its `S3.` receiver.
        args    = {bucket: bucket, key: key}.delete_if { |_, v| v.nil? }
        options = uri.nil? ? args : S3.parse_uri(uri).merge(args)
        s3      = region.nil? ? Aws::S3::Resource.new : Aws::S3::Resource.new(region: region)
        object  = s3.bucket(options[:bucket]).object(options[:key])

        IO.pipe do |read_io, write_io|
          # NOTE(review): the entire object is written into the pipe before the
          # block starts reading; large objects could block on the OS pipe
          # buffer — confirm against expected object sizes.
          object.get(response_target: write_io)
          write_io.close
          block.call(read_io)
        end
      end

      # Alternate open that yields a Reader instance supporting #read.
      def self.open2(uri = nil, **args, &block)
        if !uri.nil? && IOStreams.reader_stream?(uri)
          raise(ArgumentError, 'S3 can only accept a URI, not an IO stream when reading.')
        end

        # Lazily load the AWS SDK so that it is only required when S3 is used.
        unless defined?(Aws::S3::Resource)
          begin
            require 'aws-sdk-s3'
          rescue LoadError => exc
            raise(LoadError, "Install gem 'aws-sdk-s3' to read and write AWS S3 files: #{exc.message}")
          end
        end

        # Fix: `parse_uri` must be called on the S3 module.
        options = uri.nil? ? args : S3.parse_uri(uri).merge(args)

        begin
          io = new(**options)
          block.call(io)
        ensure
          io.close if io && (io.respond_to?(:closed?) && !io.closed?)
        end
      end

      def initialize(region: nil, bucket:, key:)
        s3      = region.nil? ? Aws::S3::Resource.new : Aws::S3::Resource.new(region: region)
        @object = s3.bucket(bucket).object(key)
        # Fix: buffer as a String so #read returns String data; the original
        # used an Array and returned an Array of chunks to callers.
        @buffer = ''
      end

      # Read up to `length` bytes from the S3 object.
      # When `length` is nil, reads the remainder of the object.
      # Returns nil once no more data is available.
      #
      # NOTE(review): each call that drains the buffer issues a fresh
      # `object.get`, re-downloading the object — confirm callers only
      # perform a single streaming pass.
      def read(length = nil, outbuf = nil)
        # Sufficient data already in the buffer
        return extract(length, outbuf) if length && (length <= @buffer.length)

        # Fetch in chunks until the requested length is buffered
        @object.get do |chunk|
          @buffer << chunk
          return extract(length, outbuf) if length && (length <= @buffer.length)
        end

        # Return whatever remains (short read at end of object), else nil.
        extract(@buffer.length, outbuf) unless @buffer.empty?
      end

      private

      attr_reader :object

      # Removes `length` bytes from the front of the buffer.
      # Honors Ruby's IO#read contract for the optional output buffer.
      def extract(length, outbuf)
        data = @buffer.slice!(0, length)
        outbuf ? outbuf.replace(data) : data
      end
    end
  end
end
@@ -0,0 +1,13 @@
1
module IOStreams
  module S3
    class Writer
      # Write to an AWS S3 object.
      #
      # Parameters:
      #   uri: [String]
      #     Optional URI of the form 's3://bucket/key'.
      #     When omitted, bucket: and key: must be supplied.
      #   bucket: [String]  S3 bucket name.
      #   region: [String]  Optional AWS region; defaults to the SDK's configured region.
      #   key:    [String]  S3 object key.
      #
      # The block is called with a writable IO; everything written to it is uploaded.
      def self.open(uri = nil, bucket: nil, region: nil, key: nil, &block)
        # Fix: the original referenced undefined locals `args` and
        # `file_name_or_io`, and called the module-level `parse_uri` without
        # its `S3.` receiver.
        args    = {bucket: bucket, key: key}.delete_if { |_, v| v.nil? }
        options = uri.nil? ? args : S3.parse_uri(uri).merge(args)
        s3      = region.nil? ? Aws::S3::Resource.new : Aws::S3::Resource.new(region: region)
        object  = s3.bucket(options[:bucket]).object(options[:key])
        # upload_stream yields a writable IO to the block and streams the upload.
        object.upload_stream(&block)
      end
    end
  end
end
@@ -92,7 +92,7 @@ module IOStreams
92
92
  # RocketJob::Formatter::Formats.streams_for_file_name('myfile.csv')
93
93
  # => [ :file ]
94
94
  def streams_for_file_name(file_name)
95
- raise ArgumentError.new("RocketJob Cannot detect file format when uploading to stream: #{file_name.inspect}") if reader_stream?(file_name)
95
+ raise ArgumentError.new("Cannot auto-detect streams when already a stream: #{file_name.inspect}") if reader_stream?(file_name)
96
96
 
97
97
  parts = file_name.split('.')
98
98
  extensions = []
@@ -0,0 +1,163 @@
1
module IOStreams
  # Common handling for efficiently processing tabular data such as CSV, spreadsheet or other tabular files
  # on a line by line basis.
  #
  # Tabular consists of a table of data where the first row is usually the header, and subsequent
  # rows are the data elements.
  #
  # Tabular applies the header information to every row of data when #as_hash is called.
  #
  # Example using the default CSV parser:
  #
  #   tabular = Tabular.new
  #   tabular.parse_header("first field,Second,thirD")
  #   # => ["first field", "Second", "thirD"]
  #
  #   tabular.cleanse_header!
  #   # => ["first_field", "second", "third"]
  #
  #   tabular.record_parse("1,2,3")
  #   # => {"first_field"=>"1", "second"=>"2", "third"=>"3"}
  #
  #   tabular.record_parse([1,2,3])
  #   # => {"first_field"=>1, "second"=>2, "third"=>3}
  #
  #   tabular.render([5,6,9])
  #   # => "5,6,9"
  #
  #   tabular.render({"third"=>"3", "first_field"=>"1" })
  #   # => "1,,3"
  class Tabular
    # Support classes are loaded lazily on first constant reference.
    autoload :Errors, 'io_streams/tabular/errors'
    autoload :Header, 'io_streams/tabular/header'

    module Parser
      autoload :Array, 'io_streams/tabular/parser/array'
      autoload :Base, 'io_streams/tabular/parser/base'
      autoload :Csv, 'io_streams/tabular/parser/csv'
      autoload :Fixed, 'io_streams/tabular/parser/fixed'
      autoload :Hash, 'io_streams/tabular/parser/hash'
      autoload :Json, 'io_streams/tabular/parser/json'
      autoload :Psv, 'io_streams/tabular/parser/psv'
    end

    module Utility
      autoload :CSVRow, 'io_streams/tabular/utility/csv_row'
    end

    # NOTE(review): @format is never assigned anywhere in this class, so
    # #format always returns nil — confirm whether it should be set in
    # #initialize or the reader removed.
    attr_reader :format, :header, :parser

    # Parse a delimited data source.
    #
    # Parameters
    #   format: [Symbol]
    #     :csv, :hash, :array, :json, :psv, :fixed
    #
    #   file_name: [String]
    #     When supplied and format is nil, the parser is selected from the
    #     file name's extension (see .parser_class_for_file_name).
    #
    # For all other parameters, see Tabular::Header.new
    def initialize(format: nil, file_name: nil, **args)
      @header = Header.new(**args)
      klass =
        if file_name && format.nil?
          self.class.parser_class_for_file_name(file_name)
        else
          self.class.parser_class(format)
        end
      @parser = klass.new
    end

    # Returns [true|false] whether a header row needs to be read first.
    def requires_header?
      parser.requires_header? && IOStreams.blank?(header.columns)
    end

    # Returns [Array] the header row/line after parsing and cleansing.
    # Returns `nil` if the row/line is blank, or a header is not required for the supplied format (:json, :hash).
    #
    # Notes:
    # * Call `requires_header?` first to determine if the header should be parsed first.
    # * The header columns are set after parsing the row, but the header is not cleansed.
    def parse_header(line)
      return if IOStreams.blank?(line) || !parser.requires_header?

      header.columns = parser.parse(line)
    end

    # Returns [Hash<String,Object>] the line as a hash, applying the header columns.
    # Returns nil if the line is blank.
    def record_parse(line)
      line = row_parse(line)
      header.to_hash(line) if line
    end

    # Returns [Array] the row/line as a parsed Array of values.
    # Returns nil if the row/line is blank.
    def row_parse(line)
      return if IOStreams.blank?(line)

      parser.parse(line)
    end

    # Renders the output row using the current parser and header.
    # Returns nil if the row is blank.
    def render(row)
      return if IOStreams.blank?(row)

      parser.render(row, header)
    end

    # Returns [Array<String>] the cleansed columns after cleansing the header in place.
    def cleanse_header!
      header.cleanse!
      header.columns
    end

    # Register a file extension and the parser class to use to format it.
    #
    # Example:
    #   # MyXlsParser must implement the Tabular::Parser interface
    #   register_extension(:xls, MyXlsParser)
    #
    # Pass nil as the extension to register the default parser.
    def self.register_extension(extension, parser)
      raise(ArgumentError, "Invalid extension #{extension.inspect}") unless extension.nil? || extension.to_s =~ /\A\w+\Z/
      @extensions[extension.nil? ? nil : extension.to_sym] = parser
    end

    # De-Register a file extension
    #
    # Returns [Symbol] the extension removed, or nil if the extension was not registered
    #
    # Example:
    #   deregister_extension(:xls)
    def self.deregister_extension(extension)
      raise(ArgumentError, "Invalid extension #{extension.inspect}") unless extension.to_s =~ /\A\w+\Z/
      @extensions.delete(extension.to_sym)
    end

    # NOTE(review): `private` below does not affect `def self.` singleton
    # methods in Ruby — .parser_class and .parser_class_for_file_name remain
    # publicly callable. Confirm whether they should be private_class_method.
    private

    # A registry to hold formats for processing files during upload or download.
    # Class instance variable (not a @@class variable), keyed by Symbol extension.
    @extensions = {}

    # Returns the registered parser class for the supplied format symbol.
    # A nil format returns the default (CSV) parser.
    # Raises ArgumentError when the format is not registered.
    def self.parser_class(format)
      @extensions[format.nil? ? nil : format.to_sym] || raise(ArgumentError, "Unknown Tabular Format: #{format.inspect}")
    end

    # Returns the parser to use with tabular for the supplied file_name.
    # Scans the extensions right-to-left (e.g. 'file.csv.gz' matches :csv)
    # and falls back to the default parser when none match.
    def self.parser_class_for_file_name(file_name)
      extension = nil
      file_name.to_s.split('.').reverse_each do |ext|
        if @extensions.include?(ext.to_sym)
          extension = ext.to_sym
          break
        end
      end
      parser_class(extension)
    end

    register_extension(nil, IOStreams::Tabular::Parser::Csv)
    register_extension(:array, IOStreams::Tabular::Parser::Array)
    register_extension(:csv, IOStreams::Tabular::Parser::Csv)
    register_extension(:fixed, IOStreams::Tabular::Parser::Fixed)
    register_extension(:hash, IOStreams::Tabular::Parser::Hash)
    register_extension(:json, IOStreams::Tabular::Parser::Json)
    register_extension(:psv, IOStreams::Tabular::Parser::Psv)
  end
end
@@ -0,0 +1,14 @@
1
module IOStreams
  class Tabular
    # Exception hierarchy for tabular parsing and header handling.
    module Errors
      # Base class for all Tabular errors; rescue this to catch any of them.
      class Error < StandardError; end

      # Raised when a header row is missing, empty, or fails validation.
      class InvalidHeader < Error; end

      # Raised when a row cannot be converted to the requested shape.
      class TypeMismatch < Error; end
    end
  end
end
@@ -0,0 +1,146 @@
1
module IOStreams
  class Tabular
    # Process files / streams that start with a header.
    class Header
      attr_accessor :columns, :allowed_columns, :required_columns, :skip_unknown

      # Header
      #
      # Parameters
      #   columns [Array<String>]
      #     Columns in this header.
      #     Note:
      #       It is recommended to keep all columns as strings to avoid any issues when persistence
      #       with MongoDB when it converts symbol keys to strings.
      #
      #   allowed_columns [Array<String>]
      #     List of columns to allow.
      #     Default: nil ( Allow all columns )
      #     Note:
      #       When supplied any columns that are rejected will be returned in the cleansed columns
      #       as nil so that they can be ignored during processing.
      #
      #   required_columns [Array<String>]
      #     List of columns that must be present, otherwise an Exception is raised.
      #
      #   skip_unknown [true|false]
      #     true:
      #       Skip columns not present in the whitelist by cleansing them to nil.
      #       #as_hash will skip these additional columns entirely as if they were not in the file at all.
      #     false:
      #       Raises Tabular::InvalidHeader when a column is supplied that is not in the whitelist.
      def initialize(columns: nil, allowed_columns: nil, required_columns: nil, skip_unknown: true)
        @columns = columns
        @required_columns = required_columns
        @allowed_columns = allowed_columns
        @skip_unknown = skip_unknown
      end

      # Returns [Array<String>] list columns that were ignored during cleansing.
      #
      # Each column is cleansed as follows:
      # - Leading and trailing whitespace is stripped.
      # - All characters converted to lower case.
      # - Spaces and '-' are converted to '_'.
      # - All characters except for letters, digits, and '_' are stripped.
      #
      # Notes
      # * Raises Tabular::InvalidHeader when there are no non-nil columns left after cleansing.
      def cleanse!
        return [] if columns.nil? || columns.empty?

        ignored_columns = []
        # Columns not in the whitelist are replaced with nil (position preserved)
        # and the ORIGINAL (un-cleansed) name is collected for reporting.
        self.columns = columns.collect do |column|
          cleansed = cleanse_column(column)
          if allowed_columns.nil? || allowed_columns.include?(cleansed)
            cleansed
          else
            ignored_columns << column
            nil
          end
        end

        # When not skipping, any unknown column at all is an error...
        if !skip_unknown && !ignored_columns.empty?
          raise(IOStreams::Tabular::Errors::InvalidHeader, "Unknown columns after cleansing: #{ignored_columns.join(',')}")
        end

        # ...otherwise only an entirely-unknown header is an error.
        # (Only reachable when skip_unknown is true, given the check above.)
        if ignored_columns.size == columns.size
          raise(IOStreams::Tabular::Errors::InvalidHeader, "All columns are unknown after cleansing: #{ignored_columns.join(',')}")
        end

        # Compare required columns against the already-cleansed column names.
        if required_columns
          missing_columns = required_columns - columns
          unless missing_columns.empty?
            raise(IOStreams::Tabular::Errors::InvalidHeader, "Missing columns after cleansing: #{missing_columns.join(',')}")
          end
        end

        ignored_columns
      end

      # Marshal to Hash from Array or Hash by applying this header
      #
      # Parameters:
      #   cleanse [true|false]
      #     Whether to cleanse and narrow the supplied hash to just those columns in this header.
      #     Only Applies to when the hash is already a Hash.
      #     Useful to turn off narrowing when the input data is already trusted.
      #
      # Returns nil for a blank row.
      # Raises Tabular::Errors::InvalidHeader for an Array row without columns set.
      # Raises Tabular::Errors::TypeMismatch for anything other than Array or Hash.
      def to_hash(row, cleanse = true)
        # IOStreams.blank? is a helper defined elsewhere in this gem.
        return if IOStreams.blank?(row)

        case row
        when Array
          raise(Tabular::Errors::InvalidHeader, "Missing mandatory header when trying to convert a row into a hash") unless columns
          array_to_hash(row)
        when Hash
          cleanse && columns ? cleanse_hash(row) : row
        else
          raise(Tabular::Errors::TypeMismatch, "Don't know how to convert #{row.class.name} to a Hash")
        end
      end

      # Marshal to Array from Hash (ordered by columns) or pass an Array through.
      # Raises Tabular::Errors::TypeMismatch when a Hash is supplied without columns,
      # or for any other non-Array type.
      def to_array(row, cleanse = true)
        if row.is_a?(Hash) && columns
          row = cleanse_hash(row) if cleanse
          row = columns.collect { |column| row[column] }
        end
        raise(Tabular::Errors::TypeMismatch, "Don't know how to convert #{row.class.name} to an Array without the header columns being set.") unless row.is_a?(Array)
        row
      end

      private

      # Zips the column names with the row values.
      # Values for nil/blank columns (cleansed-away) are dropped entirely.
      def array_to_hash(row)
        h = {}
        columns.each_with_index { |col, i| h[col] = row[i] unless IOStreams.blank?(col) }
        h
      end

      # Perform cleansing on returned Hash keys during the narrowing process.
      # For example, avoids issues with case etc.
      # Keys that do not match a column (even after cleansing) are dropped.
      def cleanse_hash(hash)
        h = {}
        hash.each_pair do |key, value|
          cleansed_key =
            if columns.include?(key)
              key
            else
              # Try again with the cleansed form of the key.
              key = cleanse_column(key)
              key if columns.include?(key)
            end
          h[cleansed_key] = value if cleansed_key
        end
        h
      end

      # Normalizes a single column name: strip, downcase, whitespace/'-' to '_',
      # then remove all remaining non-word characters.
      # Note: gsub! returns nil when nothing changed, but it mutates `cleansed`
      # in place, so returning `cleansed` on the last line is always correct.
      def cleanse_column(name)
        cleansed = name.to_s.strip.downcase
        cleansed.gsub!(/\s+/, '_')
        cleansed.gsub!(/-+/, '_')
        cleansed.gsub!(/\W+/, '')
        cleansed
      end

    end
  end
end
@@ -0,0 +1,26 @@
1
require 'json'
module IOStreams
  class Tabular
    module Parser
      # Parser for rows that are already Ruby Arrays — no serialization involved.
      class Array < Base
        # Returns [Array<String>] the header row unchanged.
        # Raises Tabular::Errors::InvalidHeader unless the supplied row is an Array.
        def parse_header(row)
          unless row.is_a?(::Array)
            raise(Tabular::Errors::InvalidHeader, "Format is :array. Invalid input header: #{row.class.name}")
          end

          row
        end

        # Returns the row unchanged.
        # Raises Tabular::Errors::TypeMismatch unless the supplied row is an Array.
        def parse(row)
          unless row.is_a?(::Array)
            raise(Tabular::Errors::TypeMismatch, "Format is :array. Invalid input: #{row.class.name}")
          end

          row
        end

        # Renders the row as an Array ordered by the supplied header's columns.
        def render(row, header)
          header.to_array(row)
        end
      end
    end
  end
end