dreader 1.1.2 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.org +7 -0
- data/README.org +31 -6
- data/examples/wikipedia_us_cities/us_cities_reject.rb +77 -0
- data/lib/dreader/engine.rb +42 -6
- data/lib/dreader/version.rb +1 -1
- metadata +4 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 3c30be2fe49c6c8ce20d4930c75f1279ac1a92a099f609b1266b14dc61c7cf3c
|
4
|
+
data.tar.gz: 58c735a67c45ef11a180bc6f17892ba912656d346320da1a716caa69661f4695
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 7847892dbcf648432a9c51867fd70e1260e956e82ea7cbbad93f92882dd867be6f36f67a0edfbb972e16e12c1364efe92a54bd03d31801770da4b28fac725350
|
7
|
+
data.tar.gz: a717955a2eaa0c406d6fb140daf9cd084c11e0d4710289de34b7757b5fd4f4e920ded4fd1450306e3e42a42278c983e078ff8e53248fe0f5b7a93d09fb8a9d40
|
data/CHANGELOG.org
CHANGED
@@ -1,5 +1,12 @@
|
|
1
1
|
#+TITLE: Changelog
|
2
2
|
|
3
|
+
* Version 1.2.0 - <2023-11-02 Thu>
|
4
|
+
** reject declaration
|
5
|
+
|
6
|
+
- A new reject declaration allows you to reject some lines. reject takes as
|
7
|
+
input a row and can predicate over columns and virtual columns. When
|
8
|
+
true, the corresponding line is discarded.
|
9
|
+
|
3
10
|
* Version 1.1.2 - <2023-10-31 Tue>
|
4
11
|
** Fixes an issue with the :extension option
|
5
12
|
|
data/README.org
CHANGED
@@ -137,7 +137,8 @@ To write an import function with Dreader:
|
|
137
137
|
and check parsed data
|
138
138
|
- Add virtual columns, that is, columns computed from other values
|
139
139
|
in the row
|
140
|
-
- Specify
|
140
|
+
- Specify what lines you want to reject, if any
|
141
|
+
- Specify how to transform lines. This is where you do the actual work
|
141
142
|
(for instance, if you process a file line by line) or put together data for
|
142
143
|
processing after the file has been fully read --- see the next step.
|
143
144
|
|
@@ -398,6 +399,9 @@ See [[file:examples/wikipedia_us_cities/us_cities_bulk_declare.rb][us_cities_bul
|
|
398
399
|
hash from the code block.
|
399
400
|
#+END_NOTES
|
400
401
|
|
402
|
+
The data read from each row of our input data is stored in a hash. The hash
|
403
|
+
uses column names as the primary key and stores the values in the =:value=
|
404
|
+
key.
|
401
405
|
|
402
406
|
*** Add virtual columns
|
403
407
|
|
@@ -427,6 +431,22 @@ Virtual columns are, of course, available to the =mapping= directive
|
|
427
431
|
(see below).
|
428
432
|
|
429
433
|
|
434
|
+
*** Specify which lines to reject
|
435
|
+
|
436
|
+
You can reject some lines using the =reject= declaration, which is applied row
|
437
|
+
by row, can predicate over columns and virtual columns, and has to return a
|
438
|
+
Boolean value.
|
439
|
+
|
440
|
+
All lines returning a truthy value will be rejected, that is, not stored in
|
441
|
+
the =@table= variable (and, consequently, not passed to the mapping function).
|
442
|
+
|
443
|
+
For instance, the following declaration rejects all lines in which the
|
444
|
+
population column is higher than =3_000_000=:
|
445
|
+
|
446
|
+
#+begin_src ruby
|
447
|
+
reject { |row| row[:population][:value] > 3_000_000 }
|
448
|
+
#+end_src
|
449
|
+
|
430
450
|
*** Specify how to process each line
|
431
451
|
|
432
452
|
The =mapping= directive specifies what to do with each line read. The
|
@@ -442,10 +462,9 @@ value of column =:age= and prints them to standard output
|
|
442
462
|
end
|
443
463
|
#+END_EXAMPLE
|
444
464
|
|
445
|
-
|
446
|
-
|
447
|
-
|
448
|
-
|
465
|
+
To invoke the =mapping= declaration on a file, use the =mappings= method,
|
466
|
+
which applies it to each row and stores in the =@table= variable
|
467
|
+
whatever value mapping returns.
|
449
468
|
|
450
469
|
*** Process data
|
451
470
|
|
@@ -501,7 +520,13 @@ A typical scenario works as follows:
|
|
501
520
|
(Optionally: check again for errors.)
|
502
521
|
|
503
522
|
5. Add your own code to process the data returned after =mappings=, which you
|
504
|
-
can
|
523
|
+
can assign to a variable (e.g., =returned_data = i.mappings=) or access
|
524
|
+
with =i.table= or =i.data= (synonyms).
|
525
|
+
|
526
|
+
#+begin_quote
|
527
|
+
Notice that =mappings= has a side effect (it overwrites =@table=), so invoking the mapping twice in a
|
528
|
+
row won't work: you need to reload the file first.
|
529
|
+
#+end_quote
|
505
530
|
|
506
531
|
Look in the examples directory for further details and a couple of working
|
507
532
|
examples.
|
@@ -0,0 +1,77 @@
|
|
1
|
+
require 'dreader'
|
2
|
+
|
3
|
+
# this is the class which will contain all the data we read from the file
|
4
|
+
class City
|
5
|
+
[:city, :state, :population, :lat, :lon].each do |var|
|
6
|
+
attr_accessor var
|
7
|
+
end
|
8
|
+
|
9
|
+
def initialize(hash)
|
10
|
+
hash.each do |k, v|
|
11
|
+
self.send("#{k}=", v)
|
12
|
+
end
|
13
|
+
end
|
14
|
+
end
|
15
|
+
|
16
|
+
class Importer
|
17
|
+
extend Dreader::Engine
|
18
|
+
|
19
|
+
# read from us_cities.tsv, lines from 2 to 307 (included)
|
20
|
+
options do
|
21
|
+
filename "us_cities.tsv"
|
22
|
+
first_row 2
|
23
|
+
last_row 307
|
24
|
+
end
|
25
|
+
|
26
|
+
# these are the columns for which we only need to specify column and name
|
27
|
+
columns ({city: 2, state: 3, latlon: 11}) do
|
28
|
+
process { |val| val.strip }
|
29
|
+
end
|
30
|
+
|
31
|
+
# the population column requires more work
|
32
|
+
column :population do |col|
|
33
|
+
col.colref 4
|
34
|
+
|
35
|
+
# make "3,000" into 3000 (int)
|
36
|
+
col.process { |value| value.gsub(",", "").to_i }
|
37
|
+
|
38
|
+
# check population is positive
|
39
|
+
col.check { |value| value > 0 }
|
40
|
+
end
|
41
|
+
|
42
|
+
# reject all cities with 3M people or more
|
43
|
+
reject do |row|
|
44
|
+
row[:population][:value] >= 3_000_000
|
45
|
+
end
|
46
|
+
|
47
|
+
mapping do |row|
|
48
|
+
# remove all additional information stored in each cell
|
49
|
+
r = Dreader::Util.simplify row
|
50
|
+
|
51
|
+
# make latlon into the lat, lon fields
|
52
|
+
r[:lat], r[:lon] = r[:latlon].split(" ")
|
53
|
+
|
54
|
+
# now r contains something like
|
55
|
+
# {lat: ..., lon: ..., city: ..., state: ..., population: ..., latlon: ...}
|
56
|
+
|
57
|
+
# remove fields which are not understood by the City class and
|
58
|
+
# make a new instance
|
59
|
+
cleaned = Dreader::Util.clean r, [:latlon]
|
60
|
+
|
61
|
+
# return the new City instance as the result of the mapping for this row
|
62
|
+
City.new(cleaned)
|
63
|
+
end
|
64
|
+
end
|
65
|
+
|
66
|
+
# load and process
|
67
|
+
importer = Importer
|
68
|
+
importer.load mapping: true, debug: true
|
69
|
+
|
70
|
+
# output everything to see whether it works
|
71
|
+
puts "First ten cities in the US with less than 3M (source Wikipedia)"
|
72
|
+
importer.table.each do |city|
|
73
|
+
[:city, :state, :population, :lat, :lon].each do |var|
|
74
|
+
puts "#{var.to_s.capitalize}: #{city.send(var)}"
|
75
|
+
end
|
76
|
+
puts ""
|
77
|
+
end
|
data/lib/dreader/engine.rb
CHANGED
@@ -21,6 +21,8 @@ module Dreader
|
|
21
21
|
attr_accessor :declared_virtual_columns
|
22
22
|
# the mapping rules
|
23
23
|
attr_accessor :declared_mapping
|
24
|
+
# the declared filter
|
25
|
+
attr_accessor :declared_reject
|
24
26
|
|
25
27
|
# the data we read
|
26
28
|
attr_reader :table
|
@@ -118,6 +120,11 @@ module Dreader
|
|
118
120
|
@declared_virtual_columns << column.to_hash.merge({ name: name })
|
119
121
|
end
|
120
122
|
|
123
|
+
# define a filter, which skips some rows
|
124
|
+
def reject(&block)
|
125
|
+
@declared_reject = block
|
126
|
+
end
|
127
|
+
|
121
128
|
# define what we do with each line we read
|
122
129
|
# - `block` is the code which takes as input a `row` and processes
|
123
130
|
# `row` is a hash in which each spreadsheet cell is accessible under
|
@@ -187,8 +194,13 @@ module Dreader
|
|
187
194
|
# this has side-effects on r
|
188
195
|
virtual_columns_on(r) if options[:virtual] || options[:mapping]
|
189
196
|
|
197
|
+
# check whether the filter would ignore this line
|
198
|
+
# notice that we need to invoke compact to avoid nil being added
|
199
|
+
# to the table
|
200
|
+
next if !options[:ignore_reject] && reject?(r)
|
201
|
+
|
190
202
|
options[:mapping] ? mappings_on(r) : r
|
191
|
-
end
|
203
|
+
end.compact
|
192
204
|
end
|
193
205
|
|
194
206
|
# TODO: PASS A ROW (and not row_number and sheet)
|
@@ -268,6 +280,7 @@ module Dreader
|
|
268
280
|
|
269
281
|
# Compute virtual columns for a row, with side effect on row
|
270
282
|
def virtual_columns_on(row)
|
283
|
+
@declared_virtual_columns ||= []
|
271
284
|
@declared_virtual_columns.each do |virtualcol|
|
272
285
|
colname = virtualcol[:name]
|
273
286
|
row[colname] = { virtual: true }
|
@@ -291,13 +304,36 @@ module Dreader
|
|
291
304
|
end
|
292
305
|
end
|
293
306
|
|
294
|
-
#
|
295
|
-
|
307
|
+
# check whether a line has to be rejected
|
308
|
+
def reject?(row)
|
309
|
+
rejected = @declared_reject&.call(row)
|
310
|
+
if rejected
|
311
|
+
@logger.debug "[dreader] row rejected by reject declaration #{row}"
|
312
|
+
end
|
313
|
+
end
|
314
|
+
|
315
|
+
# apply the mapping code to the @table. Notice that we do a side effect
|
316
|
+
# on @table and, hence, invoking the mapping twice won't work (you need to
|
317
|
+
# reload first).
|
318
|
+
#
|
319
|
+
# the mapping is applied only if it is defined and it returns the output of
|
320
|
+
# the mapping.
|
321
|
+
#
|
322
|
+
# notice also that we do a side-effect on @table. This is to make the
|
323
|
+
# behavior of
|
324
|
+
#
|
325
|
+
# i.load mapping: true
|
326
|
+
# i.table
|
327
|
+
#
|
328
|
+
# and
|
329
|
+
#
|
330
|
+
# i = load;
|
331
|
+
# i.mappings
|
332
|
+
# i.table
|
296
333
|
#
|
297
|
-
# the
|
298
|
-
# it can be used functionally
|
334
|
+
# the same
|
299
335
|
def mappings
|
300
|
-
@table.map { |row| mappings_on(row) }
|
336
|
+
@table = @table.map { |row| mappings_on(row) }
|
301
337
|
end
|
302
338
|
|
303
339
|
def mappings_on(row)
|
data/lib/dreader/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: dreader
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.
|
4
|
+
version: 1.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Adolfo Villafiorita
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-11-
|
11
|
+
date: 2023-11-02 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: roo
|
@@ -124,6 +124,7 @@ files:
|
|
124
124
|
- examples/wikipedia_us_cities/us_cities.rb
|
125
125
|
- examples/wikipedia_us_cities/us_cities.tsv
|
126
126
|
- examples/wikipedia_us_cities/us_cities_bulk_declare.rb
|
127
|
+
- examples/wikipedia_us_cities/us_cities_reject.rb
|
127
128
|
- lib/dreader.rb
|
128
129
|
- lib/dreader/column.rb
|
129
130
|
- lib/dreader/engine.rb
|
@@ -149,7 +150,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
149
150
|
- !ruby/object:Gem::Version
|
150
151
|
version: '0'
|
151
152
|
requirements: []
|
152
|
-
rubygems_version: 3.4.
|
153
|
+
rubygems_version: 3.4.21
|
153
154
|
signing_key:
|
154
155
|
specification_version: 4
|
155
156
|
summary: Process and import data from cvs and spreadsheets
|