iostreams 0.14.0 → 0.15.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (62) hide show
  1. checksums.yaml +4 -4
  2. data/LICENSE +202 -0
  3. data/README.md +155 -47
  4. data/lib/io_streams/file/reader.rb +7 -8
  5. data/lib/io_streams/file/writer.rb +7 -8
  6. data/lib/io_streams/io_streams.rb +313 -129
  7. data/lib/io_streams/{delimited → line}/reader.rb +20 -30
  8. data/lib/io_streams/line/writer.rb +81 -0
  9. data/lib/io_streams/pgp.rb +4 -14
  10. data/lib/io_streams/record/reader.rb +55 -0
  11. data/lib/io_streams/record/writer.rb +63 -0
  12. data/lib/io_streams/row/reader.rb +60 -0
  13. data/lib/io_streams/row/writer.rb +62 -0
  14. data/lib/io_streams/s3.rb +25 -0
  15. data/lib/io_streams/s3/reader.rb +64 -0
  16. data/lib/io_streams/s3/writer.rb +13 -0
  17. data/lib/io_streams/streams.rb +1 -1
  18. data/lib/io_streams/tabular.rb +163 -0
  19. data/lib/io_streams/tabular/errors.rb +14 -0
  20. data/lib/io_streams/tabular/header.rb +146 -0
  21. data/lib/io_streams/tabular/parser/array.rb +26 -0
  22. data/lib/io_streams/tabular/parser/base.rb +12 -0
  23. data/lib/io_streams/tabular/parser/csv.rb +35 -0
  24. data/lib/io_streams/tabular/parser/fixed.rb +88 -0
  25. data/lib/io_streams/tabular/parser/hash.rb +21 -0
  26. data/lib/io_streams/tabular/parser/json.rb +25 -0
  27. data/lib/io_streams/tabular/parser/psv.rb +34 -0
  28. data/lib/io_streams/tabular/utility/csv_row.rb +115 -0
  29. data/lib/io_streams/version.rb +2 -2
  30. data/lib/io_streams/xlsx/reader.rb +1 -1
  31. data/lib/io_streams/zip/reader.rb +1 -1
  32. data/lib/io_streams/zip/writer.rb +1 -1
  33. data/lib/iostreams.rb +21 -10
  34. data/test/bzip2_reader_test.rb +21 -22
  35. data/test/bzip2_writer_test.rb +38 -32
  36. data/test/file_reader_test.rb +19 -18
  37. data/test/file_writer_test.rb +23 -22
  38. data/test/files/test.json +3 -0
  39. data/test/gzip_reader_test.rb +21 -22
  40. data/test/gzip_writer_test.rb +35 -29
  41. data/test/io_streams_test.rb +137 -61
  42. data/test/line_reader_test.rb +105 -0
  43. data/test/line_writer_test.rb +50 -0
  44. data/test/pgp_reader_test.rb +29 -29
  45. data/test/pgp_test.rb +149 -195
  46. data/test/pgp_writer_test.rb +63 -62
  47. data/test/record_reader_test.rb +61 -0
  48. data/test/record_writer_test.rb +73 -0
  49. data/test/row_reader_test.rb +34 -0
  50. data/test/row_writer_test.rb +51 -0
  51. data/test/tabular_test.rb +184 -0
  52. data/test/xlsx_reader_test.rb +13 -17
  53. data/test/zip_reader_test.rb +21 -22
  54. data/test/zip_writer_test.rb +40 -36
  55. metadata +41 -17
  56. data/lib/io_streams/csv/reader.rb +0 -21
  57. data/lib/io_streams/csv/writer.rb +0 -20
  58. data/lib/io_streams/delimited/writer.rb +0 -67
  59. data/test/csv_reader_test.rb +0 -34
  60. data/test/csv_writer_test.rb +0 -35
  61. data/test/delimited_reader_test.rb +0 -115
  62. data/test/delimited_writer_test.rb +0 -44
@@ -0,0 +1,12 @@
1
+ module IOStreams
2
+ class Tabular
3
+ module Parser
4
+ class Base
5
+ # Returns [true|false] whether a header row is required for this format.
6
+ def requires_header?
7
+ true
8
+ end
9
+ end
10
+ end
11
+ end
12
+ end
@@ -0,0 +1,35 @@
1
+ module IOStreams
2
+ class Tabular
3
+ module Parser
4
+ class Csv < Base
5
+ attr_reader :csv_parser
6
+
7
+ def initialize
8
+ @csv_parser = Utility::CSVRow.new
9
+ end
10
+
11
+ # Returns [Array<String>] the header row.
12
+ # Returns nil if the row is blank.
13
+ def parse_header(row)
14
+ raise(Tabular::Errors::InvalidHeader, "Format is :csv. Invalid input header: #{row.class.name}") unless row.is_a?(String)
15
+
16
+ csv_parser.parse(row)
17
+ end
18
+
19
+ # Returns [Array] the parsed CSV line
20
+ def parse(row)
21
+ raise(Tabular::Errors::TypeMismatch, "Format is :csv. Invalid input: #{row.class.name}") unless row.is_a?(String)
22
+
23
+ csv_parser.parse(row)
24
+ end
25
+
26
+ # Return the supplied array as a single line CSV string.
27
+ def render(row, header)
28
+ array = header.to_array(row)
29
+ csv_parser.to_csv(array)
30
+ end
31
+
32
+ end
33
+ end
34
+ end
35
+ end
@@ -0,0 +1,88 @@
1
+ module IOStreams
2
+ class Tabular
3
+ module Parser
4
+ # Parsing and rendering fixed length data
5
+ class Fixed < Base
6
+ attr_reader :encoding, :encoding_options, :fixed_format
7
+
8
+ # Returns [IOStreams::Tabular::Parser]
9
+ #
10
+ # Arguments:
11
+ # format: [Array<Hash>]
12
+ # [
13
+ # {key: 'name', size: 23 },
14
+ # {key: 'address', size: 40 },
15
+ # {key: 'zip', size: 5 }
16
+ # ]
17
+ #
18
+ # encoding: [String|Encoding]
19
+ # nil: Don't perform any encoding conversion
20
+ # 'ASCII': ASCII Format
21
+ # 'UTF-8': UTF-8 Format
22
+ # Etc.
23
+ # Default: nil
24
+ #
25
+ # replacement: [String]
26
+ # The character to replace with when a character cannot be converted to the target encoding.
27
+ # nil: Don't replace any invalid characters. Encoding::UndefinedConversionError is raised.
28
+ # Default: nil
29
+ def initialize(format:, encoding: nil, replacement: nil)
30
+ @encoding = encoding.nil? || encoding.is_a?(Encoding) ? encoding : Encoding.find(encoding)
31
+ @encoding_options = replacement.nil? ? {} : {invalid: :replace, undef: :replace, replace: replacement}
32
+ @fixed_format = parse_format(format)
33
+ end
34
+
35
+ # Returns [String] fixed format values extracted from the supplied hash.
36
+ # String will be encoded to `encoding`
37
+ def render(row, header)
38
+ hash = header.to_hash(row)
39
+
40
+ result = encoding.nil? ? '' : ''.encode(encoding)
41
+ fixed_format.each do |map|
42
+ # A nil value is considered an empty string
43
+ value = hash[map.key].to_s
44
+ result <<
45
+ if encoding
46
+ format("%-#{map.size}.#{map.size}s".encode(encoding), value.encode(encoding, encoding_options))
47
+ else
48
+ format("%-#{map.size}.#{map.size}s", value)
49
+ end
50
+ end
51
+ result
52
+ end
53
+
54
+ # Returns [Hash] fixed format values extracted from the supplied line, keyed by the :key of each format entry.
55
+ # String will be encoded to `encoding`
56
+ def parse(line)
57
+ unless line.is_a?(String)
58
+ raise(Tabular::Errors::TypeMismatch, "Format is :fixed. Invalid parse input: #{line.class.name}")
59
+ end
60
+
61
+ hash = {}
62
+ index = 0
63
+ fixed_format.each do |map|
64
+ value = line[index..(index + map.size - 1)]
65
+ index += map.size
66
+ hash[map.key] = encoding.nil? ? value.strip : value.strip.encode(encoding, encoding_options)
67
+ end
68
+ hash
69
+ end
70
+
71
+ private
72
+
73
+ FixedFormat = Struct.new(:key, :size)
74
+
75
+ # Returns [Array<FixedFormat>] the format for this fixed width file.
76
+ # Also validates values
77
+ def parse_format(format)
78
+ format.collect do |map|
79
+ size = map[:size]
80
+ key = map[:key]
81
+ raise(ArgumentError, "Missing required :key and :size in: #{map.inspect}") unless size && key
82
+ FixedFormat.new(key, size)
83
+ end
84
+ end
85
+ end
86
+ end
87
+ end
88
+ end
@@ -0,0 +1,21 @@
1
+ require 'json'
2
+ module IOStreams
3
+ class Tabular
4
+ module Parser
5
+ class Hash < Base
6
+ def parse(row)
7
+ raise(Tabular::Errors::TypeMismatch, "Format is :hash. Invalid input: #{row.class.name}") unless row.is_a?(::Hash)
8
+ row
9
+ end
10
+
11
+ def render(row, header)
12
+ header.to_hash(row)
13
+ end
14
+
15
+ def requires_header?
16
+ false
17
+ end
18
+ end
19
+ end
20
+ end
21
+ end
@@ -0,0 +1,25 @@
1
+ require 'json'
2
+ module IOStreams
3
+ class Tabular
4
+ module Parser
5
+ # For parsing a single line of JSON at a time
6
+ class Json < Base
7
+ def parse(row)
8
+ raise(Tabular::Errors::TypeMismatch, "Format is :json. Invalid input: #{row.class.name}") unless row.is_a?(String)
9
+
10
+ JSON.parse(row)
11
+ end
12
+
13
+ # Return the supplied row, converted to a hash, as a single line JSON string.
14
+ def render(row, header)
15
+ hash = header.to_hash(row)
16
+ hash.to_json
17
+ end
18
+
19
+ def requires_header?
20
+ false
21
+ end
22
+ end
23
+ end
24
+ end
25
+ end
@@ -0,0 +1,34 @@
1
+ module IOStreams
2
+ class Tabular
3
+ module Parser
4
+ # For parsing a single line of Pipe-separated values
5
+ class Psv < Base
6
+ # Returns [Array<String>] the header row.
7
+ # Returns an empty Array if the row is an empty string.
8
+ def parse_header(row)
9
+ unless row.is_a?(String)
10
+ raise(Tabular::Errors::InvalidHeader, "Format is :psv. Invalid input header: #{row.class.name}")
11
+ end
12
+
13
+ row.split('|')
14
+ end
15
+
16
+ # Returns [Array] the parsed PSV line
17
+ def parse(row)
18
+ raise(Tabular::Errors::TypeMismatch, "Format is :psv. Invalid input: #{row.class.name}") unless row.is_a?(String)
19
+
20
+ row.split('|')
21
+ end
22
+
23
+ # Return the supplied row as a single line PSV string; any '|' inside a String value is replaced with ':'.
24
+ def render(row, header)
25
+ array = header.to_array(row)
26
+ cleansed_array = array.collect do |i|
27
+ i.is_a?(String) ? i.tr('|', ':') : i
28
+ end
29
+ cleansed_array.join('|')
30
+ end
31
+ end
32
+ end
33
+ end
34
+ end
@@ -0,0 +1,115 @@
1
+ require 'csv'
2
+ module IOStreams
3
+ class Tabular
4
+ module Utility
5
+ # For parsing a single line of CSV at a time
6
+ # 2 to 3 times better performance than CSV.parse_line and considerably less
7
+ # garbage collection required.
8
+ #
9
+ # Note:
10
+ # This parser does not support line feeds embedded in quoted fields since
11
+ # the file is broken apart based on line feeds during the upload process and
12
+ # is then processed by each worker on a line by line basis.
13
+ class CSVRow < ::CSV
14
+ UTF8_ENCODING = Encoding.find('UTF-8').freeze
15
+
16
+ def initialize(encoding = UTF8_ENCODING)
17
+ @io = StringIO.new(''.force_encoding(encoding))
18
+ super(@io, row_sep: '')
19
+ end
20
+
21
+ # Parse a single line of CSV data
22
+ # Parameters
23
+ # line [String]
24
+ # A single line of CSV data without any line terminators
25
+ def parse(line)
26
+ return if IOStreams.blank?(line)
27
+ return if @skip_lines and @skip_lines.match line
28
+
29
+ in_extended_col = false
30
+ csv = Array.new
31
+ parts = line.split(@col_sep, -1)
32
+ csv << nil if parts.empty?
33
+
34
+ # This loop is the hot path of csv parsing. Some things may be non-dry
35
+ # for a reason. Make sure to benchmark when refactoring.
36
+ parts.each do |part|
37
+ if in_extended_col
38
+ # If we are continuing a previous column
39
+ if part[-1] == @quote_char && part.count(@quote_char) % 2 != 0
40
+ # extended column ends
41
+ csv.last << part[0..-2]
42
+ if csv.last =~ @parsers[:stray_quote]
43
+ raise MalformedCSVError, "Missing or stray quote in line #{lineno + 1}"
44
+ end
45
+ csv.last.gsub!(@quote_char * 2, @quote_char)
46
+ in_extended_col = false
47
+ else
48
+ csv.last << part
49
+ csv.last << @col_sep
50
+ end
51
+ elsif part[0] == @quote_char
52
+ # If we are starting a new quoted column
53
+ if part[-1] != @quote_char || part.count(@quote_char) % 2 != 0
54
+ # start an extended column
55
+ csv << part[1..-1]
56
+ csv.last << @col_sep
57
+ in_extended_col = true
58
+ else
59
+ # regular quoted column
60
+ csv << part[1..-2]
61
+ if csv.last =~ @parsers[:stray_quote]
62
+ raise MalformedCSVError, "Missing or stray quote in line #{lineno + 1}"
63
+ end
64
+ csv.last.gsub!(@quote_char * 2, @quote_char)
65
+ end
66
+ elsif part =~ @parsers[:quote_or_nl]
67
+ # Unquoted field with bad characters.
68
+ if part =~ @parsers[:nl_or_lf]
69
+ raise MalformedCSVError, "Unquoted fields do not allow \\r or \\n (line #{lineno + 1})."
70
+ else
71
+ raise MalformedCSVError, "Illegal quoting in line #{lineno + 1}."
72
+ end
73
+ else
74
+ # Regular ole unquoted field.
75
+ csv << (part.empty? ? nil : part)
76
+ end
77
+ end
78
+
79
+ # Replace tacked on @col_sep with @row_sep if we are still in an extended
80
+ # column.
81
+ csv[-1][-1] = @row_sep if in_extended_col
82
+
83
+ if in_extended_col
84
+ raise MalformedCSVError, "Unclosed quoted field on line #{lineno + 1}."
85
+ end
86
+
87
+ @lineno += 1
88
+
89
+ # save unconverted fields, if needed...
90
+ unconverted = csv.dup if @unconverted_fields
91
+
92
+ # convert fields, if needed...
93
+ csv = convert_fields(csv) unless @use_headers or @converters.empty?
94
+ # parse out header rows and handle CSV::Row conversions...
95
+ csv = parse_headers(csv) if @use_headers
96
+
97
+ # inject unconverted fields and accessor, if requested...
98
+ if @unconverted_fields and not csv.respond_to? :unconverted_fields
99
+ add_unconverted_fields(csv, unconverted)
100
+ end
101
+
102
+ csv
103
+ end
104
+
105
+ # Return the supplied array as a single line CSV string.
106
+ def render(row)
107
+ row.map(&@quote).join(@col_sep) + @row_sep # quote and separate
108
+ end
109
+
110
+ alias_method :to_csv, :render
111
+
112
+ end
113
+ end
114
+ end
115
+ end
@@ -1,3 +1,3 @@
1
- module IOStreams #:nodoc
2
- VERSION = '0.14.0'
1
+ module IOStreams
2
+ VERSION = '0.15.0'
3
3
  end
@@ -25,7 +25,7 @@ module IOStreams
25
25
  file_name = temp_file.to_path
26
26
 
27
27
  ::File.open(file_name, 'wb') do |file|
28
- IOStreams.copy(file_name_or_io, file, buffer_size)
28
+ IOStreams.copy(file_name_or_io, file, buffer_size: buffer_size)
29
29
  end
30
30
  else
31
31
  file_name = file_name_or_io
@@ -33,7 +33,7 @@ module IOStreams
33
33
 
34
34
  # Stream zip stream into temp file
35
35
  ::File.open(file_name, 'wb') do |file|
36
- IOStreams.copy(file_name_or_io, file, buffer_size)
36
+ IOStreams.copy(file_name_or_io, file, buffer_size: buffer_size)
37
37
  end
38
38
 
39
39
  read_file(file_name, &block)
@@ -45,7 +45,7 @@ module IOStreams
45
45
  write_file(temp_file.to_path, zip_file_name, &block)
46
46
 
47
47
  # Stream temp file into output stream
48
- IOStreams.copy(temp_file, file_name_or_io, buffer_size)
48
+ IOStreams.copy(temp_file, file_name_or_io, buffer_size: buffer_size)
49
49
  ensure
50
50
  temp_file.delete if temp_file
51
51
  end
data/lib/iostreams.rb CHANGED
@@ -1,23 +1,23 @@
1
1
  require 'io_streams/version'
2
2
  #@formatter:off
3
3
  module IOStreams
4
- module CSV
5
- autoload :Reader, 'io_streams/csv/reader'
6
- autoload :Writer, 'io_streams/csv/writer'
4
+ module Bzip2
5
+ autoload :Reader, 'io_streams/bzip2/reader'
6
+ autoload :Writer, 'io_streams/bzip2/writer'
7
7
  end
8
8
  module File
9
9
  autoload :Reader, 'io_streams/file/reader'
10
10
  autoload :Writer, 'io_streams/file/writer'
11
11
  end
12
- module Bzip2
13
- autoload :Reader, 'io_streams/bzip2/reader'
14
- autoload :Writer, 'io_streams/bzip2/writer'
15
- end
16
12
  module Gzip
17
13
  autoload :Reader, 'io_streams/gzip/reader'
18
14
  autoload :Writer, 'io_streams/gzip/writer'
19
15
  end
20
16
  autoload :Pgp, 'io_streams/pgp'
17
+ module S3
18
+ autoload :Reader, 'io_streams/s3/reader'
19
+ autoload :Writer, 'io_streams/s3/writer'
20
+ end
21
21
  module SFTP
22
22
  autoload :Reader, 'io_streams/sftp/reader'
23
23
  autoload :Writer, 'io_streams/sftp/writer'
@@ -26,12 +26,23 @@ module IOStreams
26
26
  autoload :Reader, 'io_streams/zip/reader'
27
27
  autoload :Writer, 'io_streams/zip/writer'
28
28
  end
29
- module Delimited
30
- autoload :Reader, 'io_streams/delimited/reader'
31
- autoload :Writer, 'io_streams/delimited/writer'
29
+
30
+ module Line
31
+ autoload :Reader, 'io_streams/line/reader'
32
+ autoload :Writer, 'io_streams/line/writer'
33
+ end
34
+ module Record
35
+ autoload :Reader, 'io_streams/record/reader'
36
+ autoload :Writer, 'io_streams/record/writer'
37
+ end
38
+ module Row
39
+ autoload :Reader, 'io_streams/row/reader'
40
+ autoload :Writer, 'io_streams/row/writer'
32
41
  end
33
42
  module Xlsx
34
43
  autoload :Reader, 'io_streams/xlsx/reader'
35
44
  end
45
+
46
+ autoload :Tabular, 'io_streams/tabular'
36
47
  end
37
48
  require 'io_streams/io_streams'
@@ -1,33 +1,32 @@
1
1
  require_relative 'test_helper'
2
2
 
3
- # Unit Test for IOStreams::Gzip
4
- module Streams
5
- class Bzip2ReaderTest < Minitest::Test
6
- describe IOStreams::Bzip2::Reader do
7
- before do
8
- @file_name = File.join(File.dirname(__FILE__), 'files', 'text.txt.bz2')
9
- @gzip_data = File.open(@file_name, 'rb') { |f| f.read }
10
- @data = File.read(File.join(File.dirname(__FILE__), 'files', 'text.txt'))
11
- end
3
+ class Bzip2ReaderTest < Minitest::Test
4
+ describe IOStreams::Bzip2::Reader do
5
+ let :file_name do
6
+ File.join(File.dirname(__FILE__), 'files', 'text.txt.bz2')
7
+ end
12
8
 
13
- describe '.open' do
14
- it 'file' do
15
- result = IOStreams::Bzip2::Reader.open(@file_name) do |io|
16
- io.read
17
- end
18
- assert_equal @data, result
9
+ let :decompressed do
10
+ File.read(File.join(File.dirname(__FILE__), 'files', 'text.txt'))
11
+ end
12
+
13
+ describe '.open' do
14
+ it 'file' do
15
+ result = IOStreams::Bzip2::Reader.open(file_name) do |io|
16
+ io.read
19
17
  end
18
+ assert_equal decompressed, result
19
+ end
20
20
 
21
- it 'stream' do
22
- result = File.open(@file_name) do |file|
23
- IOStreams::Bzip2::Reader.open(file) do |io|
24
- io.read
25
- end
21
+ it 'stream' do
22
+ result = File.open(file_name) do |file|
23
+ IOStreams::Bzip2::Reader.open(file) do |io|
24
+ io.read
26
25
  end
27
- assert_equal @data, result
28
26
  end
27
+ assert_equal decompressed, result
29
28
  end
30
-
31
29
  end
30
+
32
31
  end
33
32
  end