fixed_width_file_parser 0.1.6 → 0.1.7
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/fixed_width_file_parser/version.rb +1 -1
- data/lib/fixed_width_file_parser.rb +59 -0
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a18ff4c3a7378536cc1b8a24dcc46397e9d2153b
|
4
|
+
data.tar.gz: faf99597df832a4e34870111334bd980c485a06e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 8fbaad05477bd91aff65e0d41cdace701bd20d71bf57e4c24f85a89d36d8774f3f6891df85290d6f989a34985158397adaaf9d423b44dd507520414218693f09
|
7
|
+
data.tar.gz: de10641a19c7d39f04dc8cb988b927a109a19f62a99b099eedf2f8d41b0b3fb7d6fe18ec538e55e125e5a7b2190690ff61467f871e56a57f3a6d22a736cae7bd
|
@@ -77,4 +77,63 @@ module FixedWidthFileParser
|
|
77
77
|
|
78
78
|
file.close
|
79
79
|
end
|
80
|
+
|
81
|
+
# Parses a fixed-width text file and yields one Hash per non-blank line.
#
# The file is read lazily and processed in slices of `batch_size` lines. A
# garbage-collection pass is forced once before reading starts and again after
# every slice (presumably to keep memory flat on very large files).
#
# @param filepath [String] path of the file to read
# @param fields [Array<Hash>] column definitions; every entry needs a `:name`
#   (converted with `to_sym` and used as the key of the yielded Hash) and a
#   `:position` (a Range or Integer used to index into the line)
# @param options [Hash] optional settings:
#   * `:batch_size` - number of lines per slice (default 1000)
#   * `:force_utf8_encoding` - when truthy, each line is re-encoded to UTF-8
#     and bytes that cannot be converted are dropped (default true)
#   * `:skip_header_lines` - number of leading lines to discard before parsing
#     (default 1; pass 0 for files without a header row)
# @yield [Hash] field name (Symbol) => stripped field text, or nil when the
#   position lies beyond the end of the line
# @raise [RuntimeError] when `filepath` is not a String, `fields` is not a
#   non-empty Array, an entry lacks `:name`/`:position`, or a `:position` is
#   neither a Range nor an Integer
def self.parse_in_batches(filepath, fields, options = {})
  # Set options, or use defaults
  batch_size = options.fetch(:batch_size, 1000)
  force_utf8_encoding = options.fetch(:force_utf8_encoding, true)
  skip_header_lines = options.fetch(:skip_header_lines, 1)

  # Validate arguments up front so nothing is opened or yielded for bad input
  raise '`filepath` must be a String' unless filepath.is_a?(String)
  raise '`fields` must be an Array' unless fields.is_a?(Array)
  raise '`fields` must contain at least 1 item' if fields.empty?

  # Entries that are not hash-like are reported with the same error as entries
  # with missing keys, instead of failing with NoMethodError on `key?`
  unless fields.all? { |item| item.respond_to?(:key?) && item.key?(:name) && item.key?(:position) }
    raise 'Each field hash must include a `name` and a `position`'
  end

  unless fields.all? { |item| item[:position].is_a?(Range) || item[:position].is_a?(Integer) }
    raise "Each field's `position` must be a Range or an Integer"
  end

  # Resolve the output key and slice position of every column once, rather
  # than once per line
  columns = fields.map { |field| [field[:name].to_sym, field[:position]] }

  GC.start

  File.open(filepath) do |file|
    file.lazy.drop(skip_header_lines).each_slice(batch_size) do |lines|
      lines.each do |line|
        # Skip blank lines; chomp removes a trailing "\n" or "\r\n"
        next if line.chomp.empty?

        if force_utf8_encoding
          # Treat the bytes as binary and drop everything that has no UTF-8
          # mapping, which avoids invalid byte sequence errors further down
          line = line.encode('UTF-8', 'binary', invalid: :replace, undef: :replace, replace: '')
        end

        line_fields = {}
        columns.each do |name, position|
          value = line[position]
          line_fields[name] = value.nil? ? nil : value.strip
        end

        yield(line_fields)
      end

      # Collect the slice that was just processed before reading the next one
      GC.start
    end
  end
end
|
80
139
|
end
|