iostreams 0.14.0 → 0.15.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62) hide show
  1. checksums.yaml +4 -4
  2. data/LICENSE +202 -0
  3. data/README.md +155 -47
  4. data/lib/io_streams/file/reader.rb +7 -8
  5. data/lib/io_streams/file/writer.rb +7 -8
  6. data/lib/io_streams/io_streams.rb +313 -129
  7. data/lib/io_streams/{delimited → line}/reader.rb +20 -30
  8. data/lib/io_streams/line/writer.rb +81 -0
  9. data/lib/io_streams/pgp.rb +4 -14
  10. data/lib/io_streams/record/reader.rb +55 -0
  11. data/lib/io_streams/record/writer.rb +63 -0
  12. data/lib/io_streams/row/reader.rb +60 -0
  13. data/lib/io_streams/row/writer.rb +62 -0
  14. data/lib/io_streams/s3.rb +25 -0
  15. data/lib/io_streams/s3/reader.rb +64 -0
  16. data/lib/io_streams/s3/writer.rb +13 -0
  17. data/lib/io_streams/streams.rb +1 -1
  18. data/lib/io_streams/tabular.rb +163 -0
  19. data/lib/io_streams/tabular/errors.rb +14 -0
  20. data/lib/io_streams/tabular/header.rb +146 -0
  21. data/lib/io_streams/tabular/parser/array.rb +26 -0
  22. data/lib/io_streams/tabular/parser/base.rb +12 -0
  23. data/lib/io_streams/tabular/parser/csv.rb +35 -0
  24. data/lib/io_streams/tabular/parser/fixed.rb +88 -0
  25. data/lib/io_streams/tabular/parser/hash.rb +21 -0
  26. data/lib/io_streams/tabular/parser/json.rb +25 -0
  27. data/lib/io_streams/tabular/parser/psv.rb +34 -0
  28. data/lib/io_streams/tabular/utility/csv_row.rb +115 -0
  29. data/lib/io_streams/version.rb +2 -2
  30. data/lib/io_streams/xlsx/reader.rb +1 -1
  31. data/lib/io_streams/zip/reader.rb +1 -1
  32. data/lib/io_streams/zip/writer.rb +1 -1
  33. data/lib/iostreams.rb +21 -10
  34. data/test/bzip2_reader_test.rb +21 -22
  35. data/test/bzip2_writer_test.rb +38 -32
  36. data/test/file_reader_test.rb +19 -18
  37. data/test/file_writer_test.rb +23 -22
  38. data/test/files/test.json +3 -0
  39. data/test/gzip_reader_test.rb +21 -22
  40. data/test/gzip_writer_test.rb +35 -29
  41. data/test/io_streams_test.rb +137 -61
  42. data/test/line_reader_test.rb +105 -0
  43. data/test/line_writer_test.rb +50 -0
  44. data/test/pgp_reader_test.rb +29 -29
  45. data/test/pgp_test.rb +149 -195
  46. data/test/pgp_writer_test.rb +63 -62
  47. data/test/record_reader_test.rb +61 -0
  48. data/test/record_writer_test.rb +73 -0
  49. data/test/row_reader_test.rb +34 -0
  50. data/test/row_writer_test.rb +51 -0
  51. data/test/tabular_test.rb +184 -0
  52. data/test/xlsx_reader_test.rb +13 -17
  53. data/test/zip_reader_test.rb +21 -22
  54. data/test/zip_writer_test.rb +40 -36
  55. metadata +41 -17
  56. data/lib/io_streams/csv/reader.rb +0 -21
  57. data/lib/io_streams/csv/writer.rb +0 -20
  58. data/lib/io_streams/delimited/writer.rb +0 -67
  59. data/test/csv_reader_test.rb +0 -34
  60. data/test/csv_writer_test.rb +0 -35
  61. data/test/delimited_reader_test.rb +0 -115
  62. data/test/delimited_writer_test.rb +0 -44
@@ -0,0 +1,12 @@
1
module IOStreams
  class Tabular
    module Parser
      # Common ancestor of the tabular format parsers.
      #
      # It only supplies the default answer to `requires_header?`; the Hash and
      # Json parsers override it to return false.
      class Base
        # Whether a header row is required for this format.
        #
        # Returns [true|false]. This default implementation always answers true.
        def requires_header?
          true
        end
      end
    end
  end
end
@@ -0,0 +1,35 @@
1
module IOStreams
  class Tabular
    module Parser
      # Parses and renders one line of CSV at a time.
      # All of the CSV work is delegated to an instance of Utility::CSVRow.
      class Csv < Base
        attr_reader :csv_parser

        def initialize
          @csv_parser = Utility::CSVRow.new
        end

        # Parse the header line.
        #
        # Returns [Array<String>] the header row.
        # Returns nil if the row is blank.
        #
        # Raises Tabular::Errors::InvalidHeader when `row` is not a String.
        def parse_header(row)
          unless row.is_a?(String)
            raise(Tabular::Errors::InvalidHeader, "Format is :csv. Invalid input header: #{row.class.name}")
          end

          csv_parser.parse(row)
        end

        # Parse a data line.
        #
        # Returns [Array] the parsed CSV line.
        #
        # Raises Tabular::Errors::TypeMismatch when `row` is not a String.
        def parse(row)
          unless row.is_a?(String)
            raise(Tabular::Errors::TypeMismatch, "Format is :csv. Invalid input: #{row.class.name}")
          end

          csv_parser.parse(row)
        end

        # Render `row` as a single line CSV string.
        # The row is first converted to an array by `header.to_array`.
        def render(row, header)
          csv_parser.to_csv(header.to_array(row))
        end
      end
    end
  end
end
@@ -0,0 +1,88 @@
1
module IOStreams
  class Tabular
    module Parser
      # Parsing and rendering fixed length data
      class Fixed < Base
        attr_reader :encoding, :encoding_options, :fixed_format

        # Returns [IOStreams::Tabular::Parser]
        #
        # Arguments:
        #   format: [Array<Hash>]
        #     [
        #       {key: 'name',    size: 23 },
        #       {key: 'address', size: 40 },
        #       {key: 'zip',     size: 5 }
        #     ]
        #
        #   encoding: [String|Encoding]
        #     nil: Don't perform any encoding conversion
        #     'ASCII': ASCII Format
        #     'UTF-8': UTF-8 Format
        #     Etc.
        #     Default: nil
        #
        #   replacement: [String]
        #     The character to replace with when a character cannot be converted to the target encoding.
        #     nil: Don't replace any invalid characters. Encoding::UndefinedConversionError is raised.
        #     Default: nil
        #
        # Raises ArgumentError when an entry in `format` is missing :key or :size.
        def initialize(format:, encoding: nil, replacement: nil)
          @encoding         = encoding.nil? || encoding.is_a?(Encoding) ? encoding : Encoding.find(encoding)
          @encoding_options = replacement.nil? ? {} : {invalid: :replace, undef: :replace, replace: replacement}
          @fixed_format     = parse_format(format)
        end

        # Returns [String] fixed format values extracted from the supplied row.
        # The row is first converted to a hash by `header.to_hash`.
        # Every value is left justified and padded or truncated to its column size.
        # String will be encoded to `encoding`
        def render(row, header)
          hash = header.to_hash(row)

          result = encoding.nil? ? '' : ''.encode(encoding)
          fixed_format.each do |map|
            # A nil value is considered an empty string
            value = hash[map.key].to_s
            result <<
              if encoding
                # The options must be splatted: a positional Hash is taken as the
                # source encoding argument of String#encode on Ruby 3 and later.
                format("%-#{map.size}.#{map.size}s".encode(encoding), value.encode(encoding, **encoding_options))
              else
                format("%-#{map.size}.#{map.size}s", value)
              end
          end
          result
        end

        # Returns [Hash] fixed format values extracted from the supplied line.
        # Keys are the `:key` values supplied in the format, values are stripped Strings.
        # A column that starts beyond the end of the line is returned as an empty string.
        # String will be encoded to `encoding`
        #
        # Raises Tabular::Errors::TypeMismatch when `line` is not a String.
        def parse(line)
          unless line.is_a?(String)
            raise(Tabular::Errors::TypeMismatch, "Format is :fixed. Invalid parse input: #{line.class.name}")
          end

          hash  = {}
          index = 0
          fixed_format.each do |map|
            # String#[] returns nil once index is past the end of the line,
            # hence the to_s so that short lines do not raise NoMethodError.
            value = line[index, map.size].to_s.strip
            index += map.size
            hash[map.key] = encoding.nil? ? value : value.encode(encoding, **encoding_options)
          end
          hash
        end

        private

        # Column definition: the key to store the value under and the column width.
        # Note: `private` does not apply to constants, so this remains accessible.
        FixedFormat = Struct.new(:key, :size)

        # Returns [Array<FixedFormat>] the format for this fixed width file.
        # Also validates that every entry supplies both :key and :size.
        def parse_format(format)
          format.collect do |map|
            size = map[:size]
            key  = map[:key]
            raise(ArgumentError, "Missing required :key and :size in: #{map.inspect}") unless size && key
            FixedFormat.new(key, size)
          end
        end
      end
    end
  end
end
@@ -0,0 +1,21 @@
1
+ require 'json'
2
module IOStreams
  class Tabular
    module Parser
      # Format for rows that are already Ruby hashes.
      class Hash < Base
        # A header row is not required for this format.
        def requires_header?
          false
        end

        # Returns the supplied Hash unchanged.
        #
        # Raises Tabular::Errors::TypeMismatch when `row` is not a Hash.
        def parse(row)
          return row if row.is_a?(::Hash)

          raise(Tabular::Errors::TypeMismatch, "Format is :hash. Invalid input: #{row.class.name}")
        end

        # Returns the result of converting `row` with `header.to_hash`.
        def render(row, header)
          header.to_hash(row)
        end
      end
    end
  end
end
@@ -0,0 +1,25 @@
1
+ require 'json'
2
module IOStreams
  class Tabular
    module Parser
      # For parsing a single line of JSON at a time
      class Json < Base
        # A header row is not required for this format.
        def requires_header?
          false
        end

        # Returns the result of `JSON.parse` for the supplied line.
        #
        # Raises Tabular::Errors::TypeMismatch when `row` is not a String.
        def parse(row)
          unless row.is_a?(String)
            raise(Tabular::Errors::TypeMismatch, "Format is :json. Invalid input: #{row.class.name}")
          end

          JSON.parse(row)
        end

        # Return the supplied row as a single line JSON string.
        # The row is first converted to a hash by `header.to_hash`.
        def render(row, header)
          header.to_hash(row).to_json
        end
      end
    end
  end
end
@@ -0,0 +1,34 @@
1
module IOStreams
  class Tabular
    module Parser
      # For parsing a single line of Pipe-separated values
      class Psv < Base
        # Returns [Array<String>] the header row, split on '|'.
        # An empty string yields an empty array.
        #
        # Raises Tabular::Errors::InvalidHeader when `row` is not a String.
        def parse_header(row)
          return row.split('|') if row.is_a?(String)

          raise(Tabular::Errors::InvalidHeader, "Format is :psv. Invalid input header: #{row.class.name}")
        end

        # Returns [Array] the parsed PSV line, split on '|'.
        #
        # Raises Tabular::Errors::TypeMismatch when `row` is not a String.
        def parse(row)
          return row.split('|') if row.is_a?(String)

          raise(Tabular::Errors::TypeMismatch, "Format is :psv. Invalid input: #{row.class.name}")
        end

        # Return the supplied row as a single line PSV string.
        # The row is first converted to an array by `header.to_array`.
        # Any '|' inside a String value is replaced with ':' so that it cannot
        # be mistaken for a column separator.
        def render(row, header)
          columns = header.to_array(row).map do |value|
            value.is_a?(String) ? value.tr('|', ':') : value
          end
          columns.join('|')
        end
      end
    end
  end
end
@@ -0,0 +1,115 @@
1
+ require 'csv'
2
module IOStreams
  class Tabular
    module Utility
      # For parsing a single line of CSV at a time
      # 2 to 3 times better performance than CSV.parse_line and considerably less
      # garbage collection required.
      #
      # Note:
      #   This parser does not support line feeds embedded in quoted fields since
      #   the file is broken apart based on line feeds during the upload process and
      #   is then processed by each worker on a line by line basis.
      #
      # NOTE(review): this class reads instance variables that it never assigns
      # (@col_sep, @quote_char, @row_sep, @parsers, @quote, @skip_lines, @converters,
      # @use_headers, @unconverted_fields, @lineno). Presumably they are set by
      # ::CSV#initialize - confirm against the version of the csv library in use.
      class CSVRow < ::CSV
        UTF8_ENCODING = Encoding.find('UTF-8').freeze

        # Parameters
        #   encoding [Encoding]
        #     Encoding applied to the empty string that backs this CSV instance.
        #     Default: UTF-8
        def initialize(encoding = UTF8_ENCODING)
          @io = StringIO.new(''.force_encoding(encoding))
          super(@io, row_sep: '')
        end

        # Parse a single line of CSV data
        #
        # Returns the parsed fields, or nil when the line is blank or matches @skip_lines.
        # Empty unquoted fields are returned as nil.
        #
        # Raises MalformedCSVError on stray, illegal or unclosed quotes, and on
        # line terminators in unquoted fields.
        #
        # Parameters
        #   line [String]
        #     A single line of CSV data without any line terminators
        def parse(line)
          return if IOStreams.blank?(line)
          return if @skip_lines && @skip_lines.match(line)

          in_extended_col = false
          csv             = []
          parts           = line.split(@col_sep, -1)
          csv << nil if parts.empty?

          # This loop is the hot path of csv parsing. Some things may be non-dry
          # for a reason. Make sure to benchmark when refactoring.
          parts.each do |part|
            if in_extended_col
              # If we are continuing a previous column
              if part[-1] == @quote_char && part.count(@quote_char) % 2 != 0
                # extended column ends
                csv.last << part[0..-2]
                if csv.last =~ @parsers[:stray_quote]
                  raise MalformedCSVError, "Missing or stray quote in line #{lineno + 1}"
                end
                csv.last.gsub!(@quote_char * 2, @quote_char)
                in_extended_col = false
              else
                csv.last << part
                csv.last << @col_sep
              end
            elsif part[0] == @quote_char
              # If we are starting a new quoted column
              if part[-1] != @quote_char || part.count(@quote_char) % 2 != 0
                # start an extended column
                csv << part[1..-1]
                csv.last << @col_sep
                in_extended_col = true
              else
                # regular quoted column
                csv << part[1..-2]
                if csv.last =~ @parsers[:stray_quote]
                  raise MalformedCSVError, "Missing or stray quote in line #{lineno + 1}"
                end
                csv.last.gsub!(@quote_char * 2, @quote_char)
              end
            elsif part =~ @parsers[:quote_or_nl]
              # Unquoted field with bad characters.
              if part =~ @parsers[:nl_or_lf]
                raise MalformedCSVError, "Unquoted fields do not allow \\r or \\n (line #{lineno + 1})."
              else
                raise MalformedCSVError, "Illegal quoting in line #{lineno + 1}."
              end
            else
              # Regular ole unquoted field.
              csv << (part.empty? ? nil : part)
            end
          end

          # A quoted field cannot continue onto another line since only a single
          # line is ever parsed, so a field that is still open is always an error.
          if in_extended_col
            raise MalformedCSVError, "Unclosed quoted field on line #{lineno + 1}."
          end

          @lineno += 1

          # save unconverted fields, if needed...
          unconverted = csv.dup if @unconverted_fields

          # convert fields, if needed...
          csv = convert_fields(csv) unless @use_headers || @converters.empty?
          # parse out header rows and handle CSV::Row conversions...
          csv = parse_headers(csv) if @use_headers

          # inject unconverted fields and accessor, if requested...
          if @unconverted_fields && !csv.respond_to?(:unconverted_fields)
            add_unconverted_fields(csv, unconverted)
          end

          csv
        end

        # Return the supplied array as a single line CSV string.
        def render(row)
          row.map(&@quote).join(@col_sep) + @row_sep # quote and separate
        end

        alias_method :to_csv, :render
      end
    end
  end
end
@@ -1,3 +1,3 @@
1
- module IOStreams #:nodoc
2
- VERSION = '0.14.0'
1
+ module IOStreams
2
+ VERSION = '0.15.0'
3
3
  end
@@ -25,7 +25,7 @@ module IOStreams
25
25
  file_name = temp_file.to_path
26
26
 
27
27
  ::File.open(file_name, 'wb') do |file|
28
- IOStreams.copy(file_name_or_io, file, buffer_size)
28
+ IOStreams.copy(file_name_or_io, file, buffer_size: buffer_size)
29
29
  end
30
30
  else
31
31
  file_name = file_name_or_io
@@ -33,7 +33,7 @@ module IOStreams
33
33
 
34
34
  # Stream zip stream into temp file
35
35
  ::File.open(file_name, 'wb') do |file|
36
- IOStreams.copy(file_name_or_io, file, buffer_size)
36
+ IOStreams.copy(file_name_or_io, file, buffer_size: buffer_size)
37
37
  end
38
38
 
39
39
  read_file(file_name, &block)
@@ -45,7 +45,7 @@ module IOStreams
45
45
  write_file(temp_file.to_path, zip_file_name, &block)
46
46
 
47
47
  # Stream temp file into output stream
48
- IOStreams.copy(temp_file, file_name_or_io, buffer_size)
48
+ IOStreams.copy(temp_file, file_name_or_io, buffer_size: buffer_size)
49
49
  ensure
50
50
  temp_file.delete if temp_file
51
51
  end
data/lib/iostreams.rb CHANGED
@@ -1,23 +1,23 @@
1
1
  require 'io_streams/version'
2
2
  #@formatter:off
3
3
  module IOStreams
4
- module CSV
5
- autoload :Reader, 'io_streams/csv/reader'
6
- autoload :Writer, 'io_streams/csv/writer'
4
+ module Bzip2
5
+ autoload :Reader, 'io_streams/bzip2/reader'
6
+ autoload :Writer, 'io_streams/bzip2/writer'
7
7
  end
8
8
  module File
9
9
  autoload :Reader, 'io_streams/file/reader'
10
10
  autoload :Writer, 'io_streams/file/writer'
11
11
  end
12
- module Bzip2
13
- autoload :Reader, 'io_streams/bzip2/reader'
14
- autoload :Writer, 'io_streams/bzip2/writer'
15
- end
16
12
  module Gzip
17
13
  autoload :Reader, 'io_streams/gzip/reader'
18
14
  autoload :Writer, 'io_streams/gzip/writer'
19
15
  end
20
16
  autoload :Pgp, 'io_streams/pgp'
17
+ module S3
18
+ autoload :Reader, 'io_streams/s3/reader'
19
+ autoload :Writer, 'io_streams/s3/writer'
20
+ end
21
21
  module SFTP
22
22
  autoload :Reader, 'io_streams/sftp/reader'
23
23
  autoload :Writer, 'io_streams/sftp/writer'
@@ -26,12 +26,23 @@ module IOStreams
26
26
  autoload :Reader, 'io_streams/zip/reader'
27
27
  autoload :Writer, 'io_streams/zip/writer'
28
28
  end
29
- module Delimited
30
- autoload :Reader, 'io_streams/delimited/reader'
31
- autoload :Writer, 'io_streams/delimited/writer'
29
+
30
+ module Line
31
+ autoload :Reader, 'io_streams/line/reader'
32
+ autoload :Writer, 'io_streams/line/writer'
33
+ end
34
+ module Record
35
+ autoload :Reader, 'io_streams/record/reader'
36
+ autoload :Writer, 'io_streams/record/writer'
37
+ end
38
+ module Row
39
+ autoload :Reader, 'io_streams/row/reader'
40
+ autoload :Writer, 'io_streams/row/writer'
32
41
  end
33
42
  module Xlsx
34
43
  autoload :Reader, 'io_streams/xlsx/reader'
35
44
  end
45
+
46
+ autoload :Tabular, 'io_streams/tabular'
36
47
  end
37
48
  require 'io_streams/io_streams'
@@ -1,33 +1,32 @@
1
1
  require_relative 'test_helper'
2
2
 
3
- # Unit Test for IOStreams::Gzip
4
- module Streams
5
- class Bzip2ReaderTest < Minitest::Test
6
- describe IOStreams::Bzip2::Reader do
7
- before do
8
- @file_name = File.join(File.dirname(__FILE__), 'files', 'text.txt.bz2')
9
- @gzip_data = File.open(@file_name, 'rb') { |f| f.read }
10
- @data = File.read(File.join(File.dirname(__FILE__), 'files', 'text.txt'))
11
- end
3
+ class Bzip2ReaderTest < Minitest::Test
4
+ describe IOStreams::Bzip2::Reader do
5
+ let :file_name do
6
+ File.join(File.dirname(__FILE__), 'files', 'text.txt.bz2')
7
+ end
12
8
 
13
- describe '.open' do
14
- it 'file' do
15
- result = IOStreams::Bzip2::Reader.open(@file_name) do |io|
16
- io.read
17
- end
18
- assert_equal @data, result
9
+ let :decompressed do
10
+ File.read(File.join(File.dirname(__FILE__), 'files', 'text.txt'))
11
+ end
12
+
13
+ describe '.open' do
14
+ it 'file' do
15
+ result = IOStreams::Bzip2::Reader.open(file_name) do |io|
16
+ io.read
19
17
  end
18
+ assert_equal decompressed, result
19
+ end
20
20
 
21
- it 'stream' do
22
- result = File.open(@file_name) do |file|
23
- IOStreams::Bzip2::Reader.open(file) do |io|
24
- io.read
25
- end
21
+ it 'stream' do
22
+ result = File.open(file_name) do |file|
23
+ IOStreams::Bzip2::Reader.open(file) do |io|
24
+ io.read
26
25
  end
27
- assert_equal @data, result
28
26
  end
27
+ assert_equal decompressed, result
29
28
  end
30
-
31
29
  end
30
+
32
31
  end
33
32
  end