fixed_width_file_parser 0.1.6 → 0.1.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/fixed_width_file_parser/version.rb +1 -1
- data/lib/fixed_width_file_parser.rb +59 -0
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a18ff4c3a7378536cc1b8a24dcc46397e9d2153b
|
4
|
+
data.tar.gz: faf99597df832a4e34870111334bd980c485a06e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 8fbaad05477bd91aff65e0d41cdace701bd20d71bf57e4c24f85a89d36d8774f3f6891df85290d6f989a34985158397adaaf9d423b44dd507520414218693f09
|
7
|
+
data.tar.gz: de10641a19c7d39f04dc8cb988b927a109a19f62a99b099eedf2f8d41b0b3fb7d6fe18ec538e55e125e5a7b2190690ff61467f871e56a57f3a6d22a736cae7bd
|
@@ -77,4 +77,63 @@ module FixedWidthFileParser
|
|
77
77
|
|
78
78
|
file.close
|
79
79
|
end
|
80
|
+
|
81
|
+
# Parses a fixed-width file, reading its lines in slices of `batch_size` and
# yielding one Hash per non-blank line.
#
# NOTE(review): the first line of the file is always skipped (`drop(1)`);
# presumably it is a header row -- confirm against callers.
#
# @param filepath [String] path of the file to read
# @param fields [Array<Hash>] one entry per column; each needs a `:name` and a
#   `:position` (a Range or an Integer used to index into the line)
# @param options [Hash] optional settings
# @option options [Integer] :batch_size (1000) number of lines per slice; a
#   garbage-collection run is forced after each slice
# @option options [Boolean] :force_utf8_encoding (true) re-encode every line to
#   UTF-8, replacing invalid or undefined bytes with an empty string
# @yield [Hash] field name (as a Symbol) => stripped value, or nil when the
#   position lies beyond the end of the line
# @raise [RuntimeError] when `filepath` or `fields` fail validation
def self.parse_in_batches(filepath, fields, options = {})
  # Set options, or use default
  batch_size = options.fetch(:batch_size, 1000)
  force_utf8_encoding = options.fetch(:force_utf8_encoding, true)

  # Verify `filepath` is a String
  raise '`filepath` must be a String' unless filepath.is_a?(String)

  # Verify `fields` is a non-empty Array
  raise '`fields` must be an Array' unless fields.is_a?(Array)
  raise '`fields` must contain at least 1 item' if fields.empty?

  # Verify each field has a `name` and `position`. The `respond_to?` guard
  # turns a non-Hash entry into the validation error below instead of an
  # unrelated NoMethodError from calling `key?` on it.
  unless fields.all? { |item| item.respond_to?(:key?) && item.key?(:name) && item.key?(:position) }
    raise 'Each field hash must include a `name` and a `position`'
  end

  # Verify that each `position` is either a Range or an Integer
  unless fields.all? { |item| item[:position].is_a?(Range) || item[:position].is_a?(Integer) }
    raise "Each field's `position` must be a Range or an Integer"
  end

  GC.start

  File.open(filepath) do |file|
    # `lazy` keeps the file from being read into memory all at once; only
    # `batch_size` lines are materialized per slice.
    file.lazy.drop(1).each_slice(batch_size) do |lines|
      lines.each do |line|
        # If the current line is blank, skip to the next line
        # chomp to remove "\n" and "\r\n"
        next if line.chomp.empty?

        # Force UTF8 encoding if force_utf8_encoding is true (defaults to true)
        if force_utf8_encoding
          # Handle UTF Invalid Byte Sequence Errors
          # e.g. https://robots.thoughtbot.com/fight-back-utf-8-invalid-byte-sequences
          line = line.encode('UTF-8', 'binary', invalid: :replace, undef: :replace, replace: '')
        end

        line_fields = {}
        fields.each do |field|
          # Slice once; the result is nil when the position is past the end of the line
          value = line[field[:position]]
          line_fields[field[:name].to_sym] = value.nil? ? nil : value.strip
        end

        yield(line_fields)
      end

      # Force a garbage-collection run after every batch
      GC.start
    end
  end
end
|
80
139
|
end
|