iostreams 0.14.0 → 0.15.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62)
  1. checksums.yaml +4 -4
  2. data/LICENSE +202 -0
  3. data/README.md +155 -47
  4. data/lib/io_streams/file/reader.rb +7 -8
  5. data/lib/io_streams/file/writer.rb +7 -8
  6. data/lib/io_streams/io_streams.rb +313 -129
  7. data/lib/io_streams/{delimited → line}/reader.rb +20 -30
  8. data/lib/io_streams/line/writer.rb +81 -0
  9. data/lib/io_streams/pgp.rb +4 -14
  10. data/lib/io_streams/record/reader.rb +55 -0
  11. data/lib/io_streams/record/writer.rb +63 -0
  12. data/lib/io_streams/row/reader.rb +60 -0
  13. data/lib/io_streams/row/writer.rb +62 -0
  14. data/lib/io_streams/s3.rb +25 -0
  15. data/lib/io_streams/s3/reader.rb +64 -0
  16. data/lib/io_streams/s3/writer.rb +13 -0
  17. data/lib/io_streams/streams.rb +1 -1
  18. data/lib/io_streams/tabular.rb +163 -0
  19. data/lib/io_streams/tabular/errors.rb +14 -0
  20. data/lib/io_streams/tabular/header.rb +146 -0
  21. data/lib/io_streams/tabular/parser/array.rb +26 -0
  22. data/lib/io_streams/tabular/parser/base.rb +12 -0
  23. data/lib/io_streams/tabular/parser/csv.rb +35 -0
  24. data/lib/io_streams/tabular/parser/fixed.rb +88 -0
  25. data/lib/io_streams/tabular/parser/hash.rb +21 -0
  26. data/lib/io_streams/tabular/parser/json.rb +25 -0
  27. data/lib/io_streams/tabular/parser/psv.rb +34 -0
  28. data/lib/io_streams/tabular/utility/csv_row.rb +115 -0
  29. data/lib/io_streams/version.rb +2 -2
  30. data/lib/io_streams/xlsx/reader.rb +1 -1
  31. data/lib/io_streams/zip/reader.rb +1 -1
  32. data/lib/io_streams/zip/writer.rb +1 -1
  33. data/lib/iostreams.rb +21 -10
  34. data/test/bzip2_reader_test.rb +21 -22
  35. data/test/bzip2_writer_test.rb +38 -32
  36. data/test/file_reader_test.rb +19 -18
  37. data/test/file_writer_test.rb +23 -22
  38. data/test/files/test.json +3 -0
  39. data/test/gzip_reader_test.rb +21 -22
  40. data/test/gzip_writer_test.rb +35 -29
  41. data/test/io_streams_test.rb +137 -61
  42. data/test/line_reader_test.rb +105 -0
  43. data/test/line_writer_test.rb +50 -0
  44. data/test/pgp_reader_test.rb +29 -29
  45. data/test/pgp_test.rb +149 -195
  46. data/test/pgp_writer_test.rb +63 -62
  47. data/test/record_reader_test.rb +61 -0
  48. data/test/record_writer_test.rb +73 -0
  49. data/test/row_reader_test.rb +34 -0
  50. data/test/row_writer_test.rb +51 -0
  51. data/test/tabular_test.rb +184 -0
  52. data/test/xlsx_reader_test.rb +13 -17
  53. data/test/zip_reader_test.rb +21 -22
  54. data/test/zip_writer_test.rb +40 -36
  55. metadata +41 -17
  56. data/lib/io_streams/csv/reader.rb +0 -21
  57. data/lib/io_streams/csv/writer.rb +0 -20
  58. data/lib/io_streams/delimited/writer.rb +0 -67
  59. data/test/csv_reader_test.rb +0 -34
  60. data/test/csv_writer_test.rb +0 -35
  61. data/test/delimited_reader_test.rb +0 -115
  62. data/test/delimited_writer_test.rb +0 -44
@@ -0,0 +1,25 @@
+ begin
+   require 'aws-sdk-s3'
+ rescue LoadError => exc
+   raise(LoadError, "Install gem 'aws-sdk-s3' to read and write AWS S3 files: #{exc.message}")
+ end
+
+ require 'uri'
+ module IOStreams
+   module S3
+     # Sample URI: s3://mybucket/user/abc.zip
+     def self.parse_uri(uri)
+       uri = URI.parse(uri)
+       if uri.scheme.nil?
+         # Bucket and key only: 'mybucket/user/abc.zip'
+         segments = uri.path.split('/')
+         raise "S3 URI must at the very least contain '<bucket_name>/<key>'" if (segments.size == 1) || (segments[0] == '')
+         {bucket: segments.shift, key: segments.join('/')}
+       else
+         # Full URI: 's3://mybucket/user/abc.zip'
+         raise "Invalid S3 URI, required format: 's3://<bucket_name>/<key>'" unless uri.scheme == 's3'
+         {bucket: uri.host, key: uri.path.sub(%r{\A/}, '')}
+       end
+     end
+   end
+ end
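To make the two accepted forms concrete, a minimal usage sketch (illustrative only, not part of the diff; assumes the file is loaded via require 'iostreams'):

    IOStreams::S3.parse_uri('s3://mybucket/user/abc.zip')
    # => { bucket: 'mybucket', key: 'user/abc.zip' }

    # Scheme-less form: bucket and key only
    IOStreams::S3.parse_uri('mybucket/user/abc.zip')
    # => { bucket: 'mybucket', key: 'user/abc.zip' }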
@@ -0,0 +1,64 @@
+ module IOStreams
+   module S3
+     class Reader
+       # Read from an AWS S3 file
+       def self.open(uri = nil, bucket: nil, region: nil, key: nil, &block)
+         options = uri.nil? ? {bucket: bucket, key: key} : IOStreams::S3.parse_uri(uri)
+         s3      = region.nil? ? Aws::S3::Resource.new : Aws::S3::Resource.new(region: region)
+         object  = s3.bucket(options[:bucket]).object(options[:key])
+
+         IO.pipe do |read_io, write_io|
+           object.get(response_target: write_io)
+           write_io.close
+           block.call(read_io)
+         end
+       end
+
+       def self.open2(uri = nil, **args, &block)
+         if !uri.nil? && IOStreams.reader_stream?(uri)
+           raise(ArgumentError, 'S3 can only accept a URI, not an IO stream when reading.')
+         end
+
+         unless defined?(Aws::S3::Resource)
+           begin
+             require 'aws-sdk-s3'
+           rescue LoadError => exc
+             raise(LoadError, "Install gem 'aws-sdk-s3' to read and write AWS S3 files: #{exc.message}")
+           end
+         end
+
+         options = uri.nil? ? args : IOStreams::S3.parse_uri(uri).merge(args)
+
+         begin
+           io = new(**options)
+           block.call(io)
+         ensure
+           io.close if io && (io.respond_to?(:closed?) && !io.closed?)
+         end
+       end
+
+       def initialize(region: nil, bucket:, key:)
+         s3      = region.nil? ? Aws::S3::Resource.new : Aws::S3::Resource.new(region: region)
+         @object = s3.bucket(bucket).object(key)
+         @buffer = ''.b # Binary buffer for data fetched but not yet returned
+       end
+
+       def read(length = nil, outbuf = nil)
+         # Sufficient data already in the buffer
+         return @buffer.slice!(0, length) if length && (length <= @buffer.length)
+
+         # Fetch in chunks, returning as soon as enough data has been buffered
+         @object.get do |chunk|
+           @buffer << chunk
+           return @buffer.slice!(0, length) if length && (length <= @buffer.length)
+         end
+         @buffer.slice!(0, @buffer.length) unless @buffer.empty? # Remaining data; nil at EOF
+       end
+
+       private
+
+       attr_reader :object
+
+     end
+   end
+ end
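A hedged usage sketch for the new reader (bucket, key, and region are placeholders; AWS credentials are assumed to come from the usual SDK sources such as environment variables or instance profiles):

    require 'iostreams'

    # Stream the S3 object through a local pipe and read it like any other IO
    IOStreams::S3::Reader.open('s3://mybucket/path/file.txt') do |io|
      puts io.read
    end

    # Same, without a URI
    IOStreams::S3::Reader.open(bucket: 'mybucket', key: 'path/file.txt', region: 'us-east-1') do |io|
      puts io.read
    end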
@@ -0,0 +1,13 @@
+ module IOStreams
+   module S3
+     class Writer
+       # Write to an AWS S3 file
+       def self.open(uri = nil, bucket: nil, region: nil, key: nil, &block)
+         options = uri.nil? ? {bucket: bucket, key: key} : IOStreams::S3.parse_uri(uri)
+         s3      = region.nil? ? Aws::S3::Resource.new : Aws::S3::Resource.new(region: region)
+         object  = s3.bucket(options[:bucket]).object(options[:key])
+         object.upload_stream(&block)
+       end
+     end
+   end
+ end
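Aws::S3::Object#upload_stream yields a writable IO and uploads whatever is written to it, so a writer sketch mirrors the reader (same placeholder bucket and key):

    require 'iostreams'

    IOStreams::S3::Writer.open('s3://mybucket/path/file.txt') do |io|
      io.write("Hello World\n")
    end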
@@ -92,7 +92,7 @@ module IOStreams
    # RocketJob::Formatter::Formats.streams_for_file_name('myfile.csv')
    # => [ :file ]
    def streams_for_file_name(file_name)
-     raise ArgumentError.new("RocketJob Cannot detect file format when uploading to stream: #{file_name.inspect}") if reader_stream?(file_name)
+     raise ArgumentError.new("Cannot auto-detect streams when already a stream: #{file_name.inspect}") if reader_stream?(file_name)

      parts      = file_name.split('.')
      extensions = []
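For reference, streams_for_file_name walks the file's extensions from right to left to pick the stream decoders; the renamed error only fires when an IO stream is passed instead of a file name. The return values below are assumptions based on the extension registry in io_streams.rb:

    IOStreams.streams_for_file_name('myfile.csv')
    # => [:file]
    IOStreams.streams_for_file_name('myfile.csv.gz')
    # => [:gz]  (assuming .gz is registered; the .csv stream was removed in 0.15.0)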
@@ -0,0 +1,163 @@
+ module IOStreams
+   # Common handling for efficiently processing tabular data such as CSV, spreadsheet or other tabular files
+   # on a line by line basis.
+   #
+   # Tabular consists of a table of data where the first row is usually the header, and subsequent
+   # rows are the data elements.
+   #
+   # Tabular applies the header information to every row of data when #record_parse is called.
+   #
+   # Example using the default CSV parser:
+   #
+   #   tabular = Tabular.new
+   #   tabular.parse_header("first field,Second,thirD")
+   #   # => ["first field", "Second", "thirD"]
+   #
+   #   tabular.cleanse_header!
+   #   # => ["first_field", "second", "third"]
+   #
+   #   tabular.record_parse("1,2,3")
+   #   # => {"first_field"=>"1", "second"=>"2", "third"=>"3"}
+   #
+   #   tabular.record_parse([1,2,3])
+   #   # => {"first_field"=>1, "second"=>2, "third"=>3}
+   #
+   #   tabular.render([5,6,9])
+   #   # => "5,6,9"
+   #
+   #   tabular.render({"third"=>"3", "first_field"=>"1"})
+   #   # => "1,,3"
+   class Tabular
+     autoload :Errors, 'io_streams/tabular/errors'
+     autoload :Header, 'io_streams/tabular/header'
+
+     module Parser
+       autoload :Array, 'io_streams/tabular/parser/array'
+       autoload :Base,  'io_streams/tabular/parser/base'
+       autoload :Csv,   'io_streams/tabular/parser/csv'
+       autoload :Fixed, 'io_streams/tabular/parser/fixed'
+       autoload :Hash,  'io_streams/tabular/parser/hash'
+       autoload :Json,  'io_streams/tabular/parser/json'
+       autoload :Psv,   'io_streams/tabular/parser/psv'
+     end
+
+     module Utility
+       autoload :CSVRow, 'io_streams/tabular/utility/csv_row'
+     end
+
+     attr_reader :format, :header, :parser
+
+     # Parse a delimited data source.
+     #
+     # Parameters
+     #   format: [Symbol]
+     #     :csv, :hash, :array, :json, :psv, :fixed
+     #
+     #   For all other parameters, see Tabular::Header.new
+     def initialize(format: nil, file_name: nil, **args)
+       @header = Header.new(**args)
+       klass   =
+         if file_name && format.nil?
+           self.class.parser_class_for_file_name(file_name)
+         else
+           self.class.parser_class(format)
+         end
+       @parser = klass.new
+     end
+
+     # Returns [true|false] whether a header row needs to be read first.
+     def requires_header?
+       parser.requires_header? && IOStreams.blank?(header.columns)
+     end
+
+     # Returns [Array] the header row/line after parsing.
+     # Returns nil if the row/line is blank, or a header is not required for the supplied format (:json, :hash).
+     #
+     # Notes:
+     # * Call `requires_header?` first to determine whether the header still needs to be parsed.
+     # * The header columns are set after parsing the row, but the header is not cleansed.
+     def parse_header(line)
+       return if IOStreams.blank?(line) || !parser.requires_header?
+
+       header.columns = parser.parse(line)
+     end
+
+     # Returns [Hash<String,Object>] the line as a hash.
+     # Returns nil if the line is blank.
+     def record_parse(line)
+       line = row_parse(line)
+       header.to_hash(line) if line
+     end
+
+     # Returns [Array] the row/line as a parsed Array of values.
+     # Returns nil if the row/line is blank.
+     def row_parse(line)
+       return if IOStreams.blank?(line)
+
+       parser.parse(line)
+     end
+
+     # Renders the output row
+     def render(row)
+       return if IOStreams.blank?(row)
+
+       parser.render(row, header)
+     end
+
+     # Returns [Array<String>] the cleansed columns
+     def cleanse_header!
+       header.cleanse!
+       header.columns
+     end
+
+     # Register a file extension and the parser class used to process it
+     #
+     # Example:
+     #   # MyXlsParser must implement #parse and #render
+     #   register_extension(:xls, MyXlsParser)
+     def self.register_extension(extension, parser)
+       raise(ArgumentError, "Invalid extension #{extension.inspect}") unless extension.nil? || extension.to_s =~ /\A\w+\Z/
+       @extensions[extension.nil? ? nil : extension.to_sym] = parser
+     end
+
+     # De-register a file extension
+     #
+     # Returns [Symbol] the extension removed, or nil if the extension was not registered
+     #
+     # Example:
+     #   deregister_extension(:xls)
+     def self.deregister_extension(extension)
+       raise(ArgumentError, "Invalid extension #{extension.inspect}") unless extension.to_s =~ /\A\w+\Z/
+       @extensions.delete(extension.to_sym)
+     end
+
+     private
+
+     # A registry to hold formats for processing files during upload or download
+     @extensions = {}
+
+     def self.parser_class(format)
+       @extensions[format.nil? ? nil : format.to_sym] || raise(ArgumentError, "Unknown Tabular Format: #{format.inspect}")
+     end
+
+     # Returns the parser to use with tabular for the supplied file_name
+     def self.parser_class_for_file_name(file_name)
+       extension = nil
+       file_name.to_s.split('.').reverse_each do |ext|
+         if @extensions.include?(ext.to_sym)
+           extension = ext.to_sym
+           break
+         end
+       end
+       parser_class(extension)
+     end
+
+     register_extension(nil,    IOStreams::Tabular::Parser::Csv)
+     register_extension(:array, IOStreams::Tabular::Parser::Array)
+     register_extension(:csv,   IOStreams::Tabular::Parser::Csv)
+     register_extension(:fixed, IOStreams::Tabular::Parser::Fixed)
+     register_extension(:hash,  IOStreams::Tabular::Parser::Hash)
+     register_extension(:json,  IOStreams::Tabular::Parser::Json)
+     register_extension(:psv,   IOStreams::Tabular::Parser::Psv)
+   end
+ end
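A short sketch of the registry in action; MyTsvParser is hypothetical and would need to subclass IOStreams::Tabular::Parser::Base:

    require 'iostreams'

    # The :psv parser is picked from the file name's extension
    tabular = IOStreams::Tabular.new(file_name: 'export.psv')
    tabular.parse_header('name|login')
    tabular.record_parse('Jack|jack')
    # => {"name"=>"Jack", "login"=>"jack"}

    # Teach Tabular a new extension with a custom (hypothetical) parser
    IOStreams::Tabular.register_extension(:tsv, MyTsvParser)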
@@ -0,0 +1,14 @@
+ module IOStreams
+   class Tabular
+     module Errors
+       class Error < StandardError
+       end
+
+       class InvalidHeader < Error
+       end
+
+       class TypeMismatch < Error
+       end
+     end
+   end
+ end
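Since both specific errors inherit from a common base, callers can rescue the whole family at once. A sketch, with hypothetical tabular, line, and logger objects:

    begin
      record = tabular.record_parse(line)
    rescue IOStreams::Tabular::Errors::Error => e
      logger.warn("Skipping invalid row: #{e.message}")
    end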
@@ -0,0 +1,146 @@
+ module IOStreams
+   class Tabular
+     # Process files / streams that start with a header.
+     class Header
+       attr_accessor :columns, :allowed_columns, :required_columns, :skip_unknown
+
+       # Header
+       #
+       # Parameters
+       #   columns [Array<String>]
+       #     Columns in this header.
+       #     Note:
+       #       It is recommended to keep all columns as strings to avoid issues when
+       #       persisting to MongoDB, which converts symbol keys to strings.
+       #
+       #   allowed_columns [Array<String>]
+       #     List of columns to allow.
+       #     Default: nil ( Allow all columns )
+       #     Note:
+       #       When supplied, any columns that are rejected will be returned in the cleansed columns
+       #       as nil so that they can be ignored during processing.
+       #
+       #   required_columns [Array<String>]
+       #     List of columns that must be present, otherwise an Exception is raised.
+       #
+       #   skip_unknown [true|false]
+       #     true:
+       #       Skip columns not present in the whitelist by cleansing them to nil.
+       #       #to_hash will skip these additional columns entirely as if they were not in the file at all.
+       #     false:
+       #       Raises Tabular::InvalidHeader when a column is supplied that is not in the whitelist.
+       def initialize(columns: nil, allowed_columns: nil, required_columns: nil, skip_unknown: true)
+         @columns          = columns
+         @required_columns = required_columns
+         @allowed_columns  = allowed_columns
+         @skip_unknown     = skip_unknown
+       end
+
+       # Returns [Array<String>] the list of columns that were ignored during cleansing.
+       #
+       # Each column is cleansed as follows:
+       # - Leading and trailing whitespace is stripped.
+       # - All characters converted to lower case.
+       # - Spaces and '-' are converted to '_'.
+       # - All characters except for letters, digits, and '_' are stripped.
+       #
+       # Notes
+       # * Raises Tabular::InvalidHeader when there are no non-nil columns left after cleansing.
+       def cleanse!
+         return [] if columns.nil? || columns.empty?
+
+         ignored_columns = []
+         self.columns    = columns.collect do |column|
+           cleansed = cleanse_column(column)
+           if allowed_columns.nil? || allowed_columns.include?(cleansed)
+             cleansed
+           else
+             ignored_columns << column
+             nil
+           end
+         end
+
+         if !skip_unknown && !ignored_columns.empty?
+           raise(IOStreams::Tabular::Errors::InvalidHeader, "Unknown columns after cleansing: #{ignored_columns.join(',')}")
+         end
+
+         if ignored_columns.size == columns.size
+           raise(IOStreams::Tabular::Errors::InvalidHeader, "All columns are unknown after cleansing: #{ignored_columns.join(',')}")
+         end
+
+         if required_columns
+           missing_columns = required_columns - columns
+           unless missing_columns.empty?
+             raise(IOStreams::Tabular::Errors::InvalidHeader, "Missing columns after cleansing: #{missing_columns.join(',')}")
+           end
+         end
+
+         ignored_columns
+       end
+
+       # Convert the supplied Array or Hash row into a Hash by applying this header
+       #
+       # Parameters:
+       #   cleanse [true|false]
+       #     Whether to cleanse and narrow the supplied hash to just those columns in this header.
+       #     Only applies when the row is already a Hash.
+       #     Useful to turn off narrowing when the input data is already trusted.
+       def to_hash(row, cleanse = true)
+         return if IOStreams.blank?(row)
+
+         case row
+         when Array
+           raise(Tabular::Errors::InvalidHeader, "Missing mandatory header when trying to convert a row into a hash") unless columns
+           array_to_hash(row)
+         when Hash
+           cleanse && columns ? cleanse_hash(row) : row
+         else
+           raise(Tabular::Errors::TypeMismatch, "Don't know how to convert #{row.class.name} to a Hash")
+         end
+       end
+
+       def to_array(row, cleanse = true)
+         if row.is_a?(Hash) && columns
+           row = cleanse_hash(row) if cleanse
+           row = columns.collect { |column| row[column] }
+         end
+         raise(Tabular::Errors::TypeMismatch, "Don't know how to convert #{row.class.name} to an Array without the header columns being set.") unless row.is_a?(Array)
+         row
+       end
+
+       private
+
+       def array_to_hash(row)
+         h = {}
+         columns.each_with_index { |col, i| h[col] = row[i] unless IOStreams.blank?(col) }
+         h
+       end
+
+       # Perform cleansing on returned Hash keys during the narrowing process.
+       # For example, avoids issues with case etc.
+       def cleanse_hash(hash)
+         h = {}
+         hash.each_pair do |key, value|
+           cleansed_key =
+             if columns.include?(key)
+               key
+             else
+               key = cleanse_column(key)
+               key if columns.include?(key)
+             end
+           h[cleansed_key] = value if cleansed_key
+         end
+         h
+       end
+
+       def cleanse_column(name)
+         cleansed = name.to_s.strip.downcase
+         cleansed.gsub!(/\s+/, '_')
+         cleansed.gsub!(/-+/, '_')
+         cleansed.gsub!(/\W+/, '')
+         cleansed
+       end
+
+     end
+   end
+ end
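The cleansing and narrowing behavior is easiest to see end to end; this sketch follows directly from the code above:

    header = IOStreams::Tabular::Header.new(
      columns:         ['First Name', 'Last-Name', 'Gender'],
      allowed_columns: ['first_name', 'last_name']
    )
    header.cleanse!  # => ['Gender']  (ignored: not in allowed_columns)
    header.columns   # => ['first_name', 'last_name', nil]
    header.to_hash(['Jack', 'Jones', 'Male'])
    # => {'first_name' => 'Jack', 'last_name' => 'Jones'}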
@@ -0,0 +1,26 @@
+ require 'json'
+ module IOStreams
+   class Tabular
+     module Parser
+       class Array < Base
+         # Returns [Array<String>] the header row.
+         # Returns nil if the row is blank.
+         def parse_header(row)
+           raise(Tabular::Errors::InvalidHeader, "Format is :array. Invalid input header: #{row.class.name}") unless row.is_a?(::Array)
+           row
+         end
+
+         # Returns [Array] the parsed row.
+         def parse(row)
+           raise(Tabular::Errors::TypeMismatch, "Format is :array. Invalid input: #{row.class.name}") unless row.is_a?(::Array)
+           row
+         end
+
+         def render(row, header)
+           header.to_array(row)
+         end
+
+       end
+     end
+   end
+ end
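Finally, the :array format in use: rows are already Arrays, so the parser only validates types and defers rendering to the header (sketch based on the code above):

    tabular = IOStreams::Tabular.new(format: :array, columns: %w[name login])
    tabular.row_parse(%w[Jack jack])     # => ["Jack", "jack"]
    tabular.record_parse(%w[Jack jack])  # => {"name"=>"Jack", "login"=>"jack"}
    tabular.render(%w[Jack jack])        # => ["Jack", "jack"] via Header#to_array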