dreader 1.1.2 → 1.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.org +7 -0
- data/README.org +31 -6
- data/examples/wikipedia_us_cities/us_cities_reject.rb +77 -0
- data/lib/dreader/engine.rb +42 -6
- data/lib/dreader/version.rb +1 -1
- metadata +4 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 3c30be2fe49c6c8ce20d4930c75f1279ac1a92a099f609b1266b14dc61c7cf3c
|
4
|
+
data.tar.gz: 58c735a67c45ef11a180bc6f17892ba912656d346320da1a716caa69661f4695
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 7847892dbcf648432a9c51867fd70e1260e956e82ea7cbbad93f92882dd867be6f36f67a0edfbb972e16e12c1364efe92a54bd03d31801770da4b28fac725350
|
7
|
+
data.tar.gz: a717955a2eaa0c406d6fb140daf9cd084c11e0d4710289de34b7757b5fd4f4e920ded4fd1450306e3e42a42278c983e078ff8e53248fe0f5b7a93d09fb8a9d40
|
data/CHANGELOG.org
CHANGED
@@ -1,5 +1,12 @@
|
|
1
1
|
#+TITLE: Changelog
|
2
2
|
|
3
|
+
* Version 1.2.0 - <2023-11-02 Thu>
|
4
|
+
** reject declaration
|
5
|
+
|
6
|
+
- A new reject declaration allows you to reject some lines. reject takes as
|
7
|
+
input a row and can predicate over columns and virtual columns. When
|
8
|
+
true, the corresponding line is discarded.
|
9
|
+
|
3
10
|
* Version 1.1.2 - <2023-10-31 Tue>
|
4
11
|
** Fixes an issue with the :extension option
|
5
12
|
|
data/README.org
CHANGED
@@ -137,7 +137,8 @@ To write an import function with Dreader:
|
|
137
137
|
and check parsed data
|
138
138
|
- Add virtual columns, that is, columns computed from other values
|
139
139
|
in the row
|
140
|
-
- Specify
|
140
|
+
- Specify what lines you want to reject, if any
|
141
|
+
- Specify how to transform lines. This is where you do the actual work
|
141
142
|
(for instance, if you process a file line by line) or put together data for
|
142
143
|
processing after the file has been fully read --- see the next step.
|
143
144
|
|
@@ -398,6 +399,9 @@ See [[file:examples/wikipedia_us_cities/us_cities_bulk_declare.rb][us_cities_bul
|
|
398
399
|
hash from the code block.
|
399
400
|
#+END_NOTES
|
400
401
|
|
402
|
+
The data read from each row of our input data is stored in a hash. The hash
|
403
|
+
uses column names as the primary key and stores the values in the =:value=
|
404
|
+
key.
|
401
405
|
|
402
406
|
*** Add virtual columns
|
403
407
|
|
@@ -427,6 +431,22 @@ Virtual columns are, of course, available to the =mapping= directive
|
|
427
431
|
(see below).
|
428
432
|
|
429
433
|
|
434
|
+
*** Specify which lines to reject
|
435
|
+
|
436
|
+
You can reject some lines using the =reject= declaration, which is applied row
|
437
|
+
by row, can predicate over columns and virtual columns, and has to return a
|
438
|
+
Boolean value.
|
439
|
+
|
440
|
+
All lines returning a truthy value will be rejected, that is, not stored in
|
441
|
+
the =@table= variable (and, consequently, not passed to the mapping function).
|
442
|
+
|
443
|
+
For instance, the following declaration rejects all lines in which the
|
444
|
+
population column is higher than =3_000_000=:
|
445
|
+
|
446
|
+
#+begin_src ruby
|
447
|
+
reject { |row| row[:population][:value] > 3_000_000 }
|
448
|
+
#+end_src
|
449
|
+
|
430
450
|
*** Specify how to process each line
|
431
451
|
|
432
452
|
The =mapping= directive specifies what to do with each line read. The
|
@@ -442,10 +462,9 @@ value of column =:age= and prints them to standard output
|
|
442
462
|
end
|
443
463
|
#+END_EXAMPLE
|
444
464
|
|
445
|
-
|
446
|
-
|
447
|
-
|
448
|
-
|
465
|
+
To invoke the =mapping= declaration on a file, use the =mappings= method,
|
466
|
+
which applies the mapping to each row and stores in the =@table= variable
|
467
|
+
whatever value mapping returns.
|
449
468
|
|
450
469
|
*** Process data
|
451
470
|
|
@@ -501,7 +520,13 @@ A typical scenario works as follows:
|
|
501
520
|
(Optionally: check again for errors.)
|
502
521
|
|
503
522
|
5. Add your own code to process the data returned after =mappings=, which you
|
504
|
-
can
|
523
|
+
can assign to a variable (e.g., =returned_data = i.mappings=) or access
|
524
|
+
with =i.table= or =i.data= (synonyms).
|
525
|
+
|
526
|
+
#+begin_quote
|
527
|
+
Notice that =mappings= has a side effect and invoking the mapping twice in a
|
528
|
+
row won't work: you need to reload the file first.
|
529
|
+
#+end_quote
|
505
530
|
|
506
531
|
Look in the examples directory for further details and a couple of working
|
507
532
|
examples.
|
@@ -0,0 +1,77 @@
|
|
1
|
+
require 'dreader'
|
2
|
+
|
3
|
+
# this is the class which will contain all the data we read from the file
|
4
|
+
class City
|
5
|
+
[:city, :state, :population, :lat, :lon].each do |var|
|
6
|
+
attr_accessor var
|
7
|
+
end
|
8
|
+
|
9
|
+
def initialize(hash)
|
10
|
+
hash.each do |k, v|
|
11
|
+
self.send("#{k}=", v)
|
12
|
+
end
|
13
|
+
end
|
14
|
+
end
|
15
|
+
|
16
|
+
class Importer
|
17
|
+
extend Dreader::Engine
|
18
|
+
|
19
|
+
# read from us_cities.tsv, lines from 2 to 307 (included)
|
20
|
+
options do
|
21
|
+
filename "us_cities.tsv"
|
22
|
+
first_row 2
|
23
|
+
last_row 307
|
24
|
+
end
|
25
|
+
|
26
|
+
# these are the columns for which we only need to specify column and name
|
27
|
+
columns ({city: 2, state: 3, latlon: 11}) do
|
28
|
+
process { |val| val.strip }
|
29
|
+
end
|
30
|
+
|
31
|
+
# the population column requires more work
|
32
|
+
column :population do |col|
|
33
|
+
col.colref 4
|
34
|
+
|
35
|
+
# make "3,000" into 3000 (int)
|
36
|
+
col.process { |value| value.gsub(",", "").to_i }
|
37
|
+
|
38
|
+
# check population is positive
|
39
|
+
col.check { |value| value > 0 }
|
40
|
+
end
|
41
|
+
|
42
|
+
# reject all cities with 3M people or more
|
43
|
+
reject do |row|
|
44
|
+
row[:population][:value] >= 3_000_000
|
45
|
+
end
|
46
|
+
|
47
|
+
mapping do |row|
|
48
|
+
# remove all additional information stored in each cell
|
49
|
+
r = Dreader::Util.simplify row
|
50
|
+
|
51
|
+
# make latlon into the lat, lon fields
|
52
|
+
r[:lat], r[:lon] = r[:latlon].split(" ")
|
53
|
+
|
54
|
+
# now r contains something like
|
55
|
+
# {lat: ..., lon: ..., city: ..., state: ..., population: ..., latlon: ...}
|
56
|
+
|
57
|
+
# remove fields which are not understood by the City class and
|
58
|
+
# make a new instance
|
59
|
+
cleaned = Dreader::Util.clean r, [:latlon]
|
60
|
+
|
61
|
+
# you must declare an array cities before calling importer.mapping
|
62
|
+
City.new(cleaned)
|
63
|
+
end
|
64
|
+
end
|
65
|
+
|
66
|
+
# load and process
|
67
|
+
importer = Importer
|
68
|
+
importer.load mapping: true, debug: true
|
69
|
+
|
70
|
+
# output everything to see whether it works
|
71
|
+
puts "First ten cities in the US with less than 3M (source Wikipedia)"
|
72
|
+
importer.table.each do |city|
|
73
|
+
[:city, :state, :population, :lat, :lon].each do |var|
|
74
|
+
puts "#{var.to_s.capitalize}: #{city.send(var)}"
|
75
|
+
end
|
76
|
+
puts ""
|
77
|
+
end
|
data/lib/dreader/engine.rb
CHANGED
@@ -21,6 +21,8 @@ module Dreader
|
|
21
21
|
attr_accessor :declared_virtual_columns
|
22
22
|
# the mapping rules
|
23
23
|
attr_accessor :declared_mapping
|
24
|
+
# the declared filter
|
25
|
+
attr_accessor :declared_reject
|
24
26
|
|
25
27
|
# the data we read
|
26
28
|
attr_reader :table
|
@@ -118,6 +120,11 @@ module Dreader
|
|
118
120
|
@declared_virtual_columns << column.to_hash.merge({ name: name })
|
119
121
|
end
|
120
122
|
|
123
|
+
# define a filter, which skips some rows
|
124
|
+
def reject(&block)
|
125
|
+
@declared_reject = block
|
126
|
+
end
|
127
|
+
|
121
128
|
# define what we do with each line we read
|
122
129
|
# - `block` is the code which takes as input a `row` and processes
|
123
130
|
# `row` is a hash in which each spreadsheet cell is accessible under
|
@@ -187,8 +194,13 @@ module Dreader
|
|
187
194
|
# this has side-effects on r
|
188
195
|
virtual_columns_on(r) if options[:virtual] || options[:mapping]
|
189
196
|
|
197
|
+
# check whether the filter would ignore this line
|
198
|
+
# notice that we need to invoke compact to avoid nil being added
|
199
|
+
# to the table
|
200
|
+
next if !options[:ignore_reject] && reject?(r)
|
201
|
+
|
190
202
|
options[:mapping] ? mappings_on(r) : r
|
191
|
-
end
|
203
|
+
end.compact
|
192
204
|
end
|
193
205
|
|
194
206
|
# TODO: PASS A ROW (and not row_number and sheet)
|
@@ -268,6 +280,7 @@ module Dreader
|
|
268
280
|
|
269
281
|
# Compute virtual columns for a row, with side effect on row
|
270
282
|
def virtual_columns_on(row)
|
283
|
+
@declared_virtual_columns ||= []
|
271
284
|
@declared_virtual_columns.each do |virtualcol|
|
272
285
|
colname = virtualcol[:name]
|
273
286
|
row[colname] = { virtual: true }
|
@@ -291,13 +304,36 @@ module Dreader
|
|
291
304
|
end
|
292
305
|
end
|
293
306
|
|
294
|
-
#
|
295
|
-
|
307
|
+
# check whether a line has to be rejected
|
308
|
+
def reject?(row)
|
309
|
+
rejected = @declared_reject&.call(row)
|
310
|
+
if rejected
|
311
|
+
@logger.debug "[dreader] row rejected by reject declaration #{row}"
|
312
|
+
end
|
313
|
+
end
|
314
|
+
|
315
|
+
# apply the mapping code to the @table. Notice that we do a side effect
|
316
|
+
# on @table and, hence, invoking the mapping twice won't work (you need to
|
317
|
+
# reload first).
|
318
|
+
#
|
319
|
+
# the mapping is applied only if it is defined and it returns the output of
|
320
|
+
# the mapping.
|
321
|
+
#
|
322
|
+
# notice also that we do a side-effect on @table. This is to make the
|
323
|
+
# behavior of
|
324
|
+
#
|
325
|
+
# i.load mapping: true
|
326
|
+
# i.table
|
327
|
+
#
|
328
|
+
# and
|
329
|
+
#
|
330
|
+
# i = load;
|
331
|
+
# i.mappings
|
332
|
+
# i.table
|
296
333
|
#
|
297
|
-
# the
|
298
|
-
# it can be used functionally
|
334
|
+
# the same
|
299
335
|
def mappings
|
300
|
-
@table.map { |row| mappings_on(row) }
|
336
|
+
@table = @table.map { |row| mappings_on(row) }
|
301
337
|
end
|
302
338
|
|
303
339
|
def mappings_on(row)
|
data/lib/dreader/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: dreader
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.
|
4
|
+
version: 1.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Adolfo Villafiorita
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-11-
|
11
|
+
date: 2023-11-02 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: roo
|
@@ -124,6 +124,7 @@ files:
|
|
124
124
|
- examples/wikipedia_us_cities/us_cities.rb
|
125
125
|
- examples/wikipedia_us_cities/us_cities.tsv
|
126
126
|
- examples/wikipedia_us_cities/us_cities_bulk_declare.rb
|
127
|
+
- examples/wikipedia_us_cities/us_cities_reject.rb
|
127
128
|
- lib/dreader.rb
|
128
129
|
- lib/dreader/column.rb
|
129
130
|
- lib/dreader/engine.rb
|
@@ -149,7 +150,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
149
150
|
- !ruby/object:Gem::Version
|
150
151
|
version: '0'
|
151
152
|
requirements: []
|
152
|
-
rubygems_version: 3.4.
|
153
|
+
rubygems_version: 3.4.21
|
153
154
|
signing_key:
|
154
155
|
specification_version: 4
|
155
156
|
summary: Process and import data from csv and spreadsheets
|