dreader 1.2.0 → 1.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.org +7 -12
- data/README.org +54 -52
- data/dreader.gemspec +2 -2
- data/examples/age/ages.txt +10 -0
- data/examples/template/birthdays.xlsx +0 -0
- data/lib/dreader/column.rb +2 -0
- data/lib/dreader/engine.rb +52 -88
- data/lib/dreader/options.rb +10 -0
- data/lib/dreader/util.rb +21 -0
- data/lib/dreader/version.rb +3 -1
- data/lib/dreader.rb +2 -1
- metadata +7 -16
- data/examples/age_csv/Birthdays-TabSeparated.csv +0 -13
- data/examples/age_csv/Birthdays.csv +0 -13
- data/examples/age_csv/age.rb +0 -55
- data/examples/age_noext/Birthdays +0 -0
- data/examples/age_noext/Birthdays-xlsx +0 -0
- data/examples/age_noext/Birthdays-xlsx-with-wrong-extension.xls +0 -0
- data/examples/age_noext/age.rb +0 -73
- data/examples/wikipedia_us_cities/us_cities_reject.rb +0 -77
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 520ff1f682a1b747037ccc7fb1aa13d0619dad6118690552897471cbaf53a580
|
4
|
+
data.tar.gz: 10feef9edfc5511527aecbbcc297ecdb2a05b3f2f8f0268038bc41a55974bf17
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 80e982f26d152b30ff25d57180d97139f97d40423ab623754d7c514a42f5e69fbd06f7f882fe85034880f7a3dd4a48e89c63b362dedd5f9dd2e955c4125036f0
|
7
|
+
data.tar.gz: 0df7a5c61ce2a4f72f076fdfa44a2487bd22d1f1b86862aba4cf6b3f858eb5d632a4237cd85be50f262726cf2001d98a6842c1bda157cab8046b93b60e2f7061
|
data/CHANGELOG.org
CHANGED
@@ -1,19 +1,14 @@
|
|
1
1
|
#+TITLE: Changelog
|
2
2
|
|
3
|
-
* Version 1.2.
|
4
|
-
** reject declaration
|
3
|
+
* Version 1.2.1 - <2025-08-26 Tue>
|
5
4
|
|
6
|
-
|
7
|
-
|
8
|
-
|
5
|
+
- Back to Github
|
6
|
+
- Fixes
|
7
|
+
|
8
|
+
* Version 1.2.0 - <2023-12-29 Fri>
|
9
9
|
|
10
|
-
|
11
|
-
**
|
12
|
-
|
13
|
-
- Fixes a bug related to =:extension= and adds a working example, to test
|
14
|
-
the feature
|
15
|
-
- Changes the extension from a string to a symbol. No initial dot required
|
16
|
-
any longer
|
10
|
+
** Adds support for type in columns
|
11
|
+
** (Developer) Removes Rubocop Warning
|
17
12
|
|
18
13
|
* Version 1.1.1 - <2023-10-16 Mon>
|
19
14
|
** Adds option :extension
|
data/README.org
CHANGED
@@ -137,8 +137,7 @@ To write an import function with Dreader:
|
|
137
137
|
and check parsed data
|
138
138
|
- Add virtual columns, that is, columns computed from other values
|
139
139
|
in the row
|
140
|
-
- Specify
|
141
|
-
- Specify how to transform lines. This is where you do the actual work
|
140
|
+
- Specify how to map each line. This is where you do the actual work
|
142
141
|
(for instance, if you process a file line by line) or put together data for
|
143
142
|
processing after the file has been fully read --- see the next step.
|
144
143
|
|
@@ -166,13 +165,12 @@ Require =dreader= and declare a class which extends =Dreader::Engine=:
|
|
166
165
|
end
|
167
166
|
#+END_EXAMPLE
|
168
167
|
|
169
|
-
|
168
|
+
In the class, specify the parsing options using the following syntax:
|
170
169
|
|
171
170
|
#+BEGIN_EXAMPLE ruby
|
172
171
|
options do
|
173
172
|
filename 'example.ods'
|
174
|
-
|
175
|
-
extension :ods
|
173
|
+
extension ".ods"
|
176
174
|
|
177
175
|
sheet 'Sheet 1'
|
178
176
|
|
@@ -192,10 +190,10 @@ where:
|
|
192
190
|
to supply a filename when loading the file (see =read=, below). *Use
|
193
191
|
=.tsv= for tab-separated files.*
|
194
192
|
- (optional) =extension= overrides or specify the extension of =filename=.
|
195
|
-
Takes as input a
|
196
|
-
|
197
|
-
|
198
|
-
|
193
|
+
Takes as input the extension preceded by a "." (e.g., ".xlsx"). Notice that
|
194
|
+
*the value of this option is not appended to filename* (see =read= below).
|
195
|
+
Filename must thus be a valid reference to a file in the file system. This
|
196
|
+
option is useful in one of these two circumstances:
|
199
197
|
1. When =filename= has no extension
|
200
198
|
2. When you want to override the extension of the filename, e.g., to force
|
201
199
|
reading a "file.csv" as a tab separated file
|
@@ -205,6 +203,10 @@ where:
|
|
205
203
|
will rely on =roo= to determine the last row. This is useful for
|
206
204
|
those files in which you only want to process some of the content or
|
207
205
|
contain "garbage" after the records.
|
206
|
+
- (optional) =date_format= specifies the date format, using the notation
|
207
|
+
understood by =strptime=. It is used only when the column declaration
|
208
|
+
contains a type specification (e.g., the column declaration of one or more
|
209
|
+
columns is in the form =[<column>, :date]=).
|
208
210
|
- (optional) =sheet= is the sheet name or number to read from. If not
|
209
211
|
specified, the first (default) sheet is used
|
210
212
|
- (optional) =debug= specifies that we are debugging
|
@@ -244,18 +246,43 @@ There are two notations:
|
|
244
246
|
The reference to a column can either be a letter or a number. First column
|
245
247
|
is ='A'= or =1=.
|
246
248
|
|
247
|
-
|
249
|
+
Optionally, the reference to the column can be an array. In this case, the
|
250
|
+
first element of the array is the reference to the column and the second
|
251
|
+
element is its type, that is, any of =:integer, :float,
|
252
|
+
:big_decimal, :date=:
|
248
253
|
|
249
|
-
|
250
|
-
|
251
|
-
|
252
|
-
|
254
|
+
#+begin_example ruby
|
255
|
+
# First notation, colref is put in the block
|
256
|
+
column({ name: ['A', :date] })
|
257
|
+
#+end_example
|
258
|
+
|
259
|
+
The effect of this declaration is introducing a =process= directive which
|
260
|
+
takes care of converting the input into the declared type. That is, the
|
261
|
+
notation above is a shortcut for:
|
262
|
+
|
263
|
+
#+begin_example ruby
|
264
|
+
# First notation, colref is put in the block
|
265
|
+
column({ name: 'A' }) do
|
266
|
+
process { |value| Date.strptime(value, <the value of the option date_format>) }
|
267
|
+
end
|
268
|
+
#+end_example
|
269
|
+
|
270
|
+
The =column= declaration can contain various Ruby blocks:
|
271
|
+
|
272
|
+
- one or more =check_raw= blocks. The =check_raw= blocks are run in sequence,
|
273
|
+
to check data as read from the input file. They can be used, for instance,
|
274
|
+
to verify presence of a value in the input file.
|
275
|
+
*Check must return true if there are no errors: any other value (e.g. an
|
276
|
+
array of messages) is considered an error.*
|
253
277
|
- =process= can be used to transform data into something closer to the input
|
254
278
|
data required for the importing (e.g., it can be used for downcase or
|
255
279
|
strip a string)
|
256
|
-
- one or more =check= block
|
257
|
-
|
258
|
-
|
280
|
+
- one or more =check= blocks. The =check= blocks are run in sequence on the
|
281
|
+
processed data (that is, the output of =process=) to check for errors. They
|
282
|
+
can be used, for instance, to check that a model built with =process= is
|
283
|
+
valid.
|
284
|
+
*Check must return true if there are no errors: any other value
|
285
|
+
(e.g. an array of messages) is considered an error.*
|
259
286
|
|
260
287
|
#+begin_example
|
261
288
|
column({ name: 'A' }) do
|
@@ -266,9 +293,9 @@ The =column= declaration can contain Ruby blocks:
|
|
266
293
|
#+end_example
|
267
294
|
|
268
295
|
#+begin_quote
|
269
|
-
|
270
|
-
|
271
|
-
|
296
|
+
If you declare more than one check block of the same type per column, use a
|
297
|
+
unique symbol to distinguish the blocks or the error messages will be
|
298
|
+
overwritten.
|
272
299
|
#+end_quote
|
273
300
|
|
274
301
|
#+begin_example
|
@@ -399,10 +426,6 @@ See [[file:examples/wikipedia_us_cities/us_cities_bulk_declare.rb][us_cities_bul
|
|
399
426
|
hash from the code block.
|
400
427
|
#+END_NOTES
|
401
428
|
|
402
|
-
The data read from each row of our input data is stored in a hash. The hash
|
403
|
-
uses column names as the primary key and stores the values in the =:value=
|
404
|
-
key.
|
405
|
-
|
406
429
|
*** Add virtual columns
|
407
430
|
|
408
431
|
Sometimes it is convenient to aggregate or otherwise manipulate the data
|
@@ -431,22 +454,6 @@ Virtual columns are, of course, available to the =mapping= directive
|
|
431
454
|
(see below).
|
432
455
|
|
433
456
|
|
434
|
-
*** Specify which lines to reject
|
435
|
-
|
436
|
-
You can reject some lines using the =reject= declaration, which is applied row
|
437
|
-
by row, can predicate over columns and virtual columns, and has to return a
|
438
|
-
Boolean value.
|
439
|
-
|
440
|
-
All lines returning a truish value will be be rejected, that is, not stored in
|
441
|
-
the =@table= variable (and, consequently, passed to the mapping function).
|
442
|
-
|
443
|
-
For instance, the following declaration rejects all lines in which the
|
444
|
-
population column is higher than =3_000_000=:
|
445
|
-
|
446
|
-
#+begin_src ruby
|
447
|
-
reject { |row| row[:population][:value] > 3_000_000 }
|
448
|
-
#+end_src
|
449
|
-
|
450
457
|
*** Specify how to process each line
|
451
458
|
|
452
459
|
The =mapping= directive specifies what to do with each line read. The
|
@@ -462,9 +469,10 @@ value of column =:age= and prints them to standard output
|
|
462
469
|
end
|
463
470
|
#+END_EXAMPLE
|
464
471
|
|
465
|
-
|
466
|
-
|
467
|
-
|
472
|
+
The data read from each row of our input data is stored in a hash. The hash
|
473
|
+
uses column names as the primary key and stores the values in the =:value=
|
474
|
+
key.
|
475
|
+
|
468
476
|
|
469
477
|
*** Process data
|
470
478
|
|
@@ -484,8 +492,8 @@ A typical scenario works as follows:
|
|
484
492
|
# examples:
|
485
493
|
# i.read
|
486
494
|
# i.read filename: "example.ods"
|
487
|
-
# i.read filename: "example.ods", extension:
|
488
|
-
# i.read filename: "example", extension:
|
495
|
+
# i.read filename: "example.ods", extension: ".ods"
|
496
|
+
# i.read filename: "example", extension: ".ods"
|
489
497
|
# (the line above opens the file "example" as an Open Document Spreadsheet)
|
490
498
|
i.read
|
491
499
|
|
@@ -520,13 +528,7 @@ A typical scenario works as follows:
|
|
520
528
|
(Optionally: check again for errors.)
|
521
529
|
|
522
530
|
5. Add your own code to process the data returned after =mappings=, which you
|
523
|
-
can
|
524
|
-
with =i.table= or =i.data= (synonyms).
|
525
|
-
|
526
|
-
#+begin_quote
|
527
|
-
Notice that =mappings= does a side effect and invoking the mapping twice in a
|
528
|
-
row won't work: you need to reload the file first.
|
529
|
-
#+end_quote
|
531
|
+
can access with =i.table= or =i.data= (synonyms).
|
530
532
|
|
531
533
|
Look in the examples directory for further details and a couple of working
|
532
534
|
examples.
|
data/dreader.gemspec
CHANGED
@@ -9,7 +9,7 @@ Gem::Specification.new do |spec|
|
|
9
9
|
spec.authors = ["Adolfo Villafiorita"]
|
10
10
|
spec.email = ["adolfo@shair.tech"]
|
11
11
|
|
12
|
-
spec.summary = %q{
|
12
|
+
spec.summary = %q{Porcelain on top of Roo for declarative importing of CSV and spreadsheet files}
|
13
13
|
spec.description = %q{Use this gem to specify the structure of some tabular data
|
14
14
|
you want to process. The input data can be in CSV, LibreOffice, and Excel. Each row
|
15
15
|
can then be passed to a block of code you define.
|
@@ -19,7 +19,7 @@ Rails application, but the gem can used in any Ruby application.
|
|
19
19
|
|
20
20
|
The gem should be relatively easy to use, despite its name. (Dread
|
21
21
|
stands for *d*ata *r*eader)}
|
22
|
-
spec.homepage = "https://
|
22
|
+
spec.homepage = "https://github.com/avillafiorita/dreader"
|
23
23
|
spec.license = "MIT"
|
24
24
|
|
25
25
|
spec.files = `git ls-files -z`.split("\x0").reject do |f|
|
Binary file
|
data/lib/dreader/column.rb
CHANGED
data/lib/dreader/engine.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal:true
|
2
|
+
|
1
3
|
require "roo"
|
2
4
|
require "logger"
|
3
5
|
require "fast_excel"
|
@@ -10,6 +12,8 @@ module Dreader
|
|
10
12
|
#
|
11
13
|
# This is where the real stuff begins
|
12
14
|
#
|
15
|
+
# TODO: FIX Metric?
|
16
|
+
# rubocop:disable Metrics/ModuleLength
|
13
17
|
module Engine
|
14
18
|
# the options we passed
|
15
19
|
attr_accessor :declared_options
|
@@ -21,9 +25,7 @@ module Dreader
|
|
21
25
|
attr_accessor :declared_virtual_columns
|
22
26
|
# the mapping rules
|
23
27
|
attr_accessor :declared_mapping
|
24
|
-
|
25
|
-
attr_accessor :declared_reject
|
26
|
-
|
28
|
+
|
27
29
|
# the data we read
|
28
30
|
attr_reader :table
|
29
31
|
|
@@ -48,13 +50,13 @@ module Dreader
|
|
48
50
|
|
49
51
|
@declared_columns ||= []
|
50
52
|
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
53
|
+
@declared_columns << (
|
54
|
+
if name.instance_of?(Hash)
|
55
|
+
columns(name, &block)
|
56
|
+
else
|
57
|
+
column.to_hash.merge({ name: })
|
58
|
+
end
|
59
|
+
)
|
58
60
|
end
|
59
61
|
|
60
62
|
# define a DSL for multiple column specification (bulk_declare)
|
@@ -62,7 +64,7 @@ module Dreader
|
|
62
64
|
# - hash is a hash in the form { symbolic_name: colref }
|
63
65
|
#
|
64
66
|
# i.bulk_declare {name: "B", age: "C"} is equivalent to:
|
65
|
-
#
|
67
|
+
#
|
66
68
|
# i.column :name do
|
67
69
|
# colref "B"
|
68
70
|
# end
|
@@ -91,9 +93,18 @@ module Dreader
|
|
91
93
|
# end
|
92
94
|
# end
|
93
95
|
def columns(hash, &block)
|
94
|
-
hash.
|
96
|
+
hash.each do |key, value|
|
95
97
|
column = Column.new
|
96
|
-
|
98
|
+
|
99
|
+
if value.instance_of?(Array)
|
100
|
+
column.colref value[0]
|
101
|
+
column.process do |string|
|
102
|
+
Util.convert(string, value[1], @declared_options)
|
103
|
+
end
|
104
|
+
else
|
105
|
+
column.colref value
|
106
|
+
end
|
107
|
+
|
97
108
|
column.instance_eval(&block) if block
|
98
109
|
|
99
110
|
@declared_columns ||= []
|
@@ -114,15 +125,10 @@ module Dreader
|
|
114
125
|
# they are defined
|
115
126
|
def virtual_column(name, &block)
|
116
127
|
column = Column.new
|
117
|
-
column.instance_eval
|
128
|
+
column.instance_eval(&block)
|
118
129
|
|
119
130
|
@declared_virtual_columns ||= []
|
120
|
-
@declared_virtual_columns << column.to_hash.merge({ name:
|
121
|
-
end
|
122
|
-
|
123
|
-
# define a filter, which skips some rows
|
124
|
-
def reject(&block)
|
125
|
-
@declared_reject = block
|
131
|
+
@declared_virtual_columns << column.to_hash.merge({ name: })
|
126
132
|
end
|
127
133
|
|
128
134
|
# define what we do with each line we read
|
@@ -194,13 +200,8 @@ module Dreader
|
|
194
200
|
# this has side-effects on r
|
195
201
|
virtual_columns_on(r) if options[:virtual] || options[:mapping]
|
196
202
|
|
197
|
-
# check whether the filter would ignore this line
|
198
|
-
# notice that we need to invoke compact to avoid nil being added
|
199
|
-
# to the table
|
200
|
-
next if !options[:ignore_reject] && reject?(r)
|
201
|
-
|
202
203
|
options[:mapping] ? mappings_on(r) : r
|
203
|
-
end
|
204
|
+
end
|
204
205
|
end
|
205
206
|
|
206
207
|
# TODO: PASS A ROW (and not row_number and sheet)
|
@@ -227,10 +228,10 @@ module Dreader
|
|
227
228
|
coord = coord(row_number, colspec[:colref], cell)
|
228
229
|
begin
|
229
230
|
processed = colspec[:process] ? colspec[:process].call(cell) : cell
|
230
|
-
@logger.debug "[dreader] #{colname} process #{coord} yields '#{processed}' (#{processed.class})"
|
231
|
+
@logger.debug "[dreader] '#{colname}' process @ #{coord} yields '#{processed}' (#{processed.class})"
|
231
232
|
r[colname][:value] = processed
|
232
233
|
rescue => e
|
233
|
-
@logger.error "[dreader] #{colname} process #{coord} raises an exception"
|
234
|
+
@logger.error "[dreader] '#{colname}' process @ #{coord} raises an exception"
|
234
235
|
raise e
|
235
236
|
end
|
236
237
|
|
@@ -280,11 +281,10 @@ module Dreader
|
|
280
281
|
|
281
282
|
# Compute virtual columns for, with side effect on row
|
282
283
|
def virtual_columns_on(row)
|
283
|
-
@declared_virtual_columns ||= []
|
284
284
|
@declared_virtual_columns.each do |virtualcol|
|
285
285
|
colname = virtualcol[:name]
|
286
286
|
row[colname] = { virtual: true }
|
287
|
-
|
287
|
+
|
288
288
|
check_data(virtualcol[:checks_raw], row, colname, full_row: true)
|
289
289
|
|
290
290
|
begin
|
@@ -304,36 +304,13 @@ module Dreader
|
|
304
304
|
end
|
305
305
|
end
|
306
306
|
|
307
|
-
#
|
308
|
-
|
309
|
-
rejected = @declared_reject&.call(row)
|
310
|
-
if rejected
|
311
|
-
@logger.debug "[dreader] row rejected by reject declaration #{row}"
|
312
|
-
end
|
313
|
-
end
|
314
|
-
|
315
|
-
# apply the mapping code to the @table. Notice that we do a side effect
|
316
|
-
# on @table and, hence, invoking the mapping twice won't work (you need to
|
317
|
-
# reload first).
|
318
|
-
#
|
319
|
-
# the mapping is applied only if it defined and it returns the output of
|
320
|
-
# the mapping.
|
321
|
-
#
|
322
|
-
# notice also that we do a side-effect on @table. This is to make the
|
323
|
-
# behavior of
|
324
|
-
#
|
325
|
-
# i.load mapping: true
|
326
|
-
# i.table
|
307
|
+
# apply the mapping code to the array; it makes sense to invoke it only
|
308
|
+
# once.
|
327
309
|
#
|
328
|
-
# and
|
329
|
-
#
|
330
|
-
# i = load;
|
331
|
-
# i.mappings
|
332
|
-
# i.table
|
333
|
-
#
|
334
|
-
# the same
|
310
|
+
# the mapping is applied only if it is defined and it uses map, so that
|
311
|
+
# it can be used functionally
|
335
312
|
def mappings
|
336
|
-
@table
|
313
|
+
@table.map { |row| mappings_on(row) }
|
337
314
|
end
|
338
315
|
|
339
316
|
def mappings_on(row)
|
@@ -431,49 +408,36 @@ module Dreader
|
|
431
408
|
|
432
409
|
private
|
433
410
|
|
434
|
-
# list of keys we support in options. We remove them when reading
|
435
|
-
# the CSV file
|
436
|
-
OPTION_KEYS = %i[
|
437
|
-
filename extension sheet first_row last_row
|
438
|
-
logger logger_level
|
439
|
-
debug
|
440
|
-
]
|
441
|
-
|
442
411
|
def open_spreadsheet(options)
|
443
412
|
filename = options[:filename]
|
444
|
-
|
445
|
-
extension = options[:extension] || File.extname(filename).downcase[1..-1]&.to_sym
|
446
|
-
|
447
|
-
|
448
|
-
# TODO: MAKE DEBUG AND LOGGER INTO REAL CLASS VARIABLES OR MAKE LOCAL AND/OR FUNCTIONS
|
449
|
-
@debug = @declared_options.merge(options)[:debug] == true
|
450
|
-
if @debug
|
451
|
-
@logger = options[:logger] || Logger.new($stdout)
|
452
|
-
@logger.debug "[dreader open_spreadsheet] filename: #{filename}"
|
453
|
-
@logger.debug "[dreader open_spreadsheet] extension: #{extension}"
|
454
|
-
end
|
413
|
+
ext = options[:extension] || File.extname(filename)
|
455
414
|
|
456
|
-
case
|
457
|
-
when
|
458
|
-
csv_options = @declared_options.except(*
|
415
|
+
case ext
|
416
|
+
when ".csv"
|
417
|
+
csv_options = @declared_options.except(*Options::NON_CSV_KEYS)
|
459
418
|
Roo::CSV.new(filename, csv_options:)
|
460
|
-
when
|
461
|
-
csv_options = @declared_options.except(*
|
419
|
+
when ".tsv"
|
420
|
+
csv_options = @declared_options.except(*Options::NON_CSV_KEYS).merge({ col_sep: "\t" })
|
462
421
|
Roo::CSV.new(filename, csv_options:)
|
463
|
-
when
|
464
|
-
Roo::
|
422
|
+
when ".ods"
|
423
|
+
Roo::OpenOffice.new(filename)
|
424
|
+
when ".xls"
|
425
|
+
Roo::Excel.new(filename)
|
426
|
+
when ".xlsx"
|
427
|
+
Roo::Excelx.new(filename)
|
465
428
|
else
|
466
|
-
raise "Unknown extension: #{ext}
|
429
|
+
raise "Unknown extension: #{ext}"
|
467
430
|
end
|
468
431
|
end
|
469
432
|
|
470
433
|
def colref_to_i(colref)
|
471
434
|
return colref if colref.instance_of?(Integer)
|
435
|
+
|
472
436
|
value = 0
|
473
437
|
power = 1
|
474
438
|
colref.to_s.reverse.split("").map do |char|
|
475
|
-
value
|
476
|
-
power
|
439
|
+
value += power * (1 + char.ord - 'A'.ord)
|
440
|
+
power *= 26
|
477
441
|
end
|
478
442
|
value - 1
|
479
443
|
end
|
@@ -496,7 +460,7 @@ module Dreader
|
|
496
460
|
#
|
497
461
|
# - debug :: a boolean
|
498
462
|
def check_data(check_spec, hash, colname, full_row: false)
|
499
|
-
check_spec.each do |error_message, check_function|
|
463
|
+
(check_spec || []).each do |error_message, check_function|
|
500
464
|
# here we extract values by distinguishing whether the hash is that of
|
501
465
|
# column or that of a row
|
502
466
|
if full_row
|
data/lib/dreader/options.rb
CHANGED
@@ -1,6 +1,16 @@
|
|
1
|
+
# frozen_string_literal:true
|
2
|
+
|
1
3
|
module Dreader
|
2
4
|
# service class to implement the options DSL language
|
3
5
|
class Options
|
6
|
+
# List of keys we support in options and which are not understood by the
|
7
|
+
# CSV reader
|
8
|
+
#
|
9
|
+
# We remove them when reading the CSV file
|
10
|
+
NON_CSV_KEYS = %i[
|
11
|
+
filename extension sheet first_row last_row logger logger_level date_format debug
|
12
|
+
].freeze
|
13
|
+
|
4
14
|
def initialize
|
5
15
|
@attributes = {}
|
6
16
|
end
|
data/lib/dreader/util.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module Dreader
|
2
4
|
# Utility functions to simplify importing data into
|
3
5
|
# ActiveRecords
|
@@ -82,5 +84,24 @@ module Dreader
|
|
82
84
|
error[:row] == row && (col.nil? || error[:col] == col)
|
83
85
|
end
|
84
86
|
end
|
87
|
+
|
88
|
+
#
|
89
|
+
# Convert a string to a given type
|
90
|
+
#
|
91
|
+
def self.convert(value, type, options = {})
|
92
|
+
case type
|
93
|
+
when :integer
|
94
|
+
value.to_i
|
95
|
+
when :float
|
96
|
+
value.to_f
|
97
|
+
when :big_decimal
|
98
|
+
BigDecimal(value)
|
99
|
+
when :date
|
100
|
+
date_format = options[:date_format] || "%d/%m/%Y"
|
101
|
+
Date.strptime(value, date_format)
|
102
|
+
else
|
103
|
+
value
|
104
|
+
end
|
105
|
+
end
|
85
106
|
end
|
86
107
|
end
|
data/lib/dreader/version.rb
CHANGED
data/lib/dreader.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: dreader
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.2.
|
4
|
+
version: 1.2.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Adolfo Villafiorita
|
8
|
-
autorequire:
|
9
8
|
bindir: exe
|
10
9
|
cert_chain: []
|
11
|
-
date:
|
10
|
+
date: 1980-01-02 00:00:00.000000000 Z
|
12
11
|
dependencies:
|
13
12
|
- !ruby/object:Gem::Dependency
|
14
13
|
name: roo
|
@@ -108,34 +107,27 @@ files:
|
|
108
107
|
- dreader.gemspec
|
109
108
|
- examples/age/Birthdays.ods
|
110
109
|
- examples/age/age.rb
|
111
|
-
- examples/
|
112
|
-
- examples/age_csv/Birthdays.csv
|
113
|
-
- examples/age_csv/age.rb
|
114
|
-
- examples/age_noext/Birthdays
|
115
|
-
- examples/age_noext/Birthdays-xlsx
|
116
|
-
- examples/age_noext/Birthdays-xlsx-with-wrong-extension.xls
|
117
|
-
- examples/age_noext/age.rb
|
110
|
+
- examples/age/ages.txt
|
118
111
|
- examples/age_with_multiple_checks/Birthdays.ods
|
119
112
|
- examples/age_with_multiple_checks/age_with_multiple_checks.rb
|
120
113
|
- examples/local_vars/local_vars.rb
|
114
|
+
- examples/template/birthdays.xlsx
|
121
115
|
- examples/template/template_generation.rb
|
122
116
|
- examples/wikipedia_big_us_cities/big_us_cities.rb
|
123
117
|
- examples/wikipedia_big_us_cities/cities_by_state.ods
|
124
118
|
- examples/wikipedia_us_cities/us_cities.rb
|
125
119
|
- examples/wikipedia_us_cities/us_cities.tsv
|
126
120
|
- examples/wikipedia_us_cities/us_cities_bulk_declare.rb
|
127
|
-
- examples/wikipedia_us_cities/us_cities_reject.rb
|
128
121
|
- lib/dreader.rb
|
129
122
|
- lib/dreader/column.rb
|
130
123
|
- lib/dreader/engine.rb
|
131
124
|
- lib/dreader/options.rb
|
132
125
|
- lib/dreader/util.rb
|
133
126
|
- lib/dreader/version.rb
|
134
|
-
homepage: https://
|
127
|
+
homepage: https://github.com/avillafiorita/dreader
|
135
128
|
licenses:
|
136
129
|
- MIT
|
137
130
|
metadata: {}
|
138
|
-
post_install_message:
|
139
131
|
rdoc_options: []
|
140
132
|
require_paths:
|
141
133
|
- lib
|
@@ -150,8 +142,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
150
142
|
- !ruby/object:Gem::Version
|
151
143
|
version: '0'
|
152
144
|
requirements: []
|
153
|
-
rubygems_version: 3.
|
154
|
-
signing_key:
|
145
|
+
rubygems_version: 3.6.7
|
155
146
|
specification_version: 4
|
156
|
-
summary:
|
147
|
+
summary: Porcelain on top of Roo for declarative importing of CSV and spreadsheet files
|
157
148
|
test_files: []
|
@@ -1,13 +0,0 @@
|
|
1
|
-
Name Date of birth
|
2
|
-
Forest Whitaker July 15, 1961
|
3
|
-
Daniel Day-Lewis April 29, 1957
|
4
|
-
Sean Penn August 17, 1960
|
5
|
-
Jeff Bridges December 4, 1949
|
6
|
-
Colin Firth September 10, 1960
|
7
|
-
Jean Dujardin June 19, 1972
|
8
|
-
Daniel Day-Lewis April 29, 1957
|
9
|
-
Matthew McConaughey November 4, 1969
|
10
|
-
Eddie Redmayne January 6, 1982
|
11
|
-
Leonardo DiCaprio November 11, 1974
|
12
|
-
Casey Affleck August 12, 1975
|
13
|
-
Gary Oldman March 21, 1958
|
@@ -1,13 +0,0 @@
|
|
1
|
-
Name,Date of birth
|
2
|
-
Forest Whitaker,"July 15, 1961"
|
3
|
-
Daniel Day-Lewis,"April 29, 1957"
|
4
|
-
Sean Penn,"August 17, 1960"
|
5
|
-
Jeff Bridges,"December 4, 1949"
|
6
|
-
Colin Firth,"September 10, 1960"
|
7
|
-
Jean Dujardin,"June 19, 1972"
|
8
|
-
Daniel Day-Lewis,"April 29, 1957"
|
9
|
-
Matthew McConaughey,"November 4, 1969"
|
10
|
-
Eddie Redmayne,"January 6, 1982"
|
11
|
-
Leonardo DiCaprio,"November 11, 1974"
|
12
|
-
Casey Affleck,"August 12, 1975"
|
13
|
-
Gary Oldman,"March 21, 1958"
|
data/examples/age_csv/age.rb
DELETED
@@ -1,55 +0,0 @@
|
|
1
|
-
require "dreader"
|
2
|
-
|
3
|
-
class Reader
|
4
|
-
extend Dreader::Engine
|
5
|
-
|
6
|
-
options do
|
7
|
-
first_row 2
|
8
|
-
debug true
|
9
|
-
end
|
10
|
-
|
11
|
-
column :name do
|
12
|
-
doc "A is the name string"
|
13
|
-
colref 'A'
|
14
|
-
end
|
15
|
-
|
16
|
-
column :birthdate do
|
17
|
-
doc "Birthdate contains a full date (i.e., including the year)"
|
18
|
-
colref 'B'
|
19
|
-
|
20
|
-
process do |c|
|
21
|
-
Date.parse(c)
|
22
|
-
end
|
23
|
-
end
|
24
|
-
|
25
|
-
virtual_column :age do
|
26
|
-
process do |row|
|
27
|
-
birthdate = row[:birthdate][:value]
|
28
|
-
birthday = Date.new(Date.today.year, birthdate.month, birthdate.day)
|
29
|
-
today = Date.today
|
30
|
-
|
31
|
-
[0, today.year - birthdate.year - (birthday < today ? 1 : 0)].max
|
32
|
-
end
|
33
|
-
end
|
34
|
-
|
35
|
-
mapping do |row|
|
36
|
-
r = Dreader::Util.simplify(row)
|
37
|
-
puts "#{r[:name]} is #{r[:age]} years old (born on #{r[:birthdate]})"
|
38
|
-
end
|
39
|
-
end
|
40
|
-
|
41
|
-
i = Reader
|
42
|
-
i.read filename: "Birthdays.csv", mapping: true
|
43
|
-
|
44
|
-
i.read filename: "Birthdays-TabSeparated.csv", extension: :tsv, mapping: true
|
45
|
-
|
46
|
-
#
|
47
|
-
# Here we can do further processing on the data
|
48
|
-
#
|
49
|
-
File.open("ages.txt", "w") do |file|
|
50
|
-
i.table.each do |row|
|
51
|
-
unless row[:row_errors].any?
|
52
|
-
file.puts "#{row[:name][:value]} #{row[:age][:value]}"
|
53
|
-
end
|
54
|
-
end
|
55
|
-
end
|
Binary file
|
Binary file
|
Binary file
|
data/examples/age_noext/age.rb
DELETED
@@ -1,73 +0,0 @@
|
|
1
|
-
require "dreader"
|
2
|
-
|
3
|
-
class Reader
|
4
|
-
extend Dreader::Engine
|
5
|
-
|
6
|
-
options do
|
7
|
-
first_row 2
|
8
|
-
debug true
|
9
|
-
extension :ods
|
10
|
-
end
|
11
|
-
|
12
|
-
column :name do
|
13
|
-
doc "A is the name string"
|
14
|
-
colref 'A'
|
15
|
-
end
|
16
|
-
|
17
|
-
column :birthdate do
|
18
|
-
doc "Birthdate contains a full date (i.e., including the year)"
|
19
|
-
colref 'B'
|
20
|
-
|
21
|
-
process do |c|
|
22
|
-
Date.parse(c)
|
23
|
-
end
|
24
|
-
end
|
25
|
-
|
26
|
-
virtual_column :age do
|
27
|
-
process do |row|
|
28
|
-
birthdate = row[:birthdate][:value]
|
29
|
-
birthday = Date.new(Date.today.year, birthdate.month, birthdate.day)
|
30
|
-
today = Date.today
|
31
|
-
|
32
|
-
[0, today.year - birthdate.year - (birthday < today ? 1 : 0)].max
|
33
|
-
end
|
34
|
-
end
|
35
|
-
|
36
|
-
mapping do |row|
|
37
|
-
r = Dreader::Util.simplify(row)
|
38
|
-
puts "#{r[:name]} is #{r[:age]} years old (born on #{r[:birthdate]})"
|
39
|
-
end
|
40
|
-
end
|
41
|
-
|
42
|
-
puts
|
43
|
-
puts "*****************************************************************"
|
44
|
-
puts "Reading ODS with no extension, using extension set in the options"
|
45
|
-
puts "*****************************************************************"
|
46
|
-
puts
|
47
|
-
|
48
|
-
i = Reader
|
49
|
-
i.read filename: "Birthdays"
|
50
|
-
i.virtual_columns
|
51
|
-
i.mappings
|
52
|
-
|
53
|
-
puts
|
54
|
-
puts "*****************************************************************"
|
55
|
-
puts "Reading XLSX with wrong extension, overriding existing extension"
|
56
|
-
puts "*****************************************************************"
|
57
|
-
puts
|
58
|
-
|
59
|
-
i = Reader
|
60
|
-
i.read filename: "Birthdays-xlsx-with-wrong-extension.xls", extension: :xlsx
|
61
|
-
i.virtual_columns
|
62
|
-
i.mappings
|
63
|
-
|
64
|
-
puts
|
65
|
-
puts "*****************************************************************"
|
66
|
-
puts "Reading XLSX with no extension"
|
67
|
-
puts "*****************************************************************"
|
68
|
-
puts
|
69
|
-
|
70
|
-
i = Reader
|
71
|
-
i.read filename: "Birthdays-xlsx", extension: :xlsx
|
72
|
-
i.virtual_columns
|
73
|
-
i.mappings
|
@@ -1,77 +0,0 @@
|
|
1
|
-
require 'dreader'
|
2
|
-
|
3
|
-
# this is the class which will contain all the data we read from the file
|
4
|
-
class City
|
5
|
-
[:city, :state, :population, :lat, :lon].each do |var|
|
6
|
-
attr_accessor var
|
7
|
-
end
|
8
|
-
|
9
|
-
def initialize(hash)
|
10
|
-
hash.each do |k, v|
|
11
|
-
self.send("#{k}=", v)
|
12
|
-
end
|
13
|
-
end
|
14
|
-
end
|
15
|
-
|
16
|
-
class Importer
|
17
|
-
extend Dreader::Engine
|
18
|
-
|
19
|
-
# read from us_cities.tsv, lines from 2 to 10 (included)
|
20
|
-
options do
|
21
|
-
filename "us_cities.tsv"
|
22
|
-
first_row 2
|
23
|
-
last_row 307
|
24
|
-
end
|
25
|
-
|
26
|
-
# these are the columns for which we only need to specify column and name
|
27
|
-
columns ({city: 2, state: 3, latlon: 11}) do
|
28
|
-
process { |val| val.strip }
|
29
|
-
end
|
30
|
-
|
31
|
-
# the population column requires more work
|
32
|
-
column :population do |col|
|
33
|
-
col.colref 4
|
34
|
-
|
35
|
-
# make "3,000" into 3000 (int)
|
36
|
-
col.process { |value| value.gsub(",", "").to_i }
|
37
|
-
|
38
|
-
# check population is positive
|
39
|
-
col.check { |value| value > 0 }
|
40
|
-
end
|
41
|
-
|
42
|
-
# reject all cities with more than 3M people
|
43
|
-
reject do |row|
|
44
|
-
row[:population][:value] >= 3_000_000
|
45
|
-
end
|
46
|
-
|
47
|
-
mapping do |row|
|
48
|
-
# remove all additional information stored in each cell
|
49
|
-
r = Dreader::Util.simplify row
|
50
|
-
|
51
|
-
# make latlon into the lat, lon fields
|
52
|
-
r[:lat], r[:lon] = r[:latlon].split(" ")
|
53
|
-
|
54
|
-
# now r contains something like
|
55
|
-
# {lat: ..., lon: ..., city: ..., state: ..., population: ..., latlon: ...}
|
56
|
-
|
57
|
-
# remove fields which are not understood by the Cities class and
|
58
|
-
# make a new instance
|
59
|
-
cleaned = Dreader::Util.clean r, [:latlon]
|
60
|
-
|
61
|
-
# you must declare an array cities before calling importer.mapping
|
62
|
-
City.new(cleaned)
|
63
|
-
end
|
64
|
-
end
|
65
|
-
|
66
|
-
# load and process
|
67
|
-
importer = Importer
|
68
|
-
importer.load mapping: true, debug: true
|
69
|
-
|
70
|
-
# output everything to see whether it works
|
71
|
-
puts "First ten cities in the US with less than 3M (source Wikipedia)"
|
72
|
-
importer.table.each do |city|
|
73
|
-
[:city, :state, :population, :lat, :lon].each do |var|
|
74
|
-
puts "#{var.to_s.capitalize}: #{city.send(var)}"
|
75
|
-
end
|
76
|
-
puts ""
|
77
|
-
end
|