cure 0.1.2 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +13 -3
- data/.tool-versions +1 -0
- data/Dockerfile +1 -1
- data/Gemfile +1 -0
- data/Gemfile.lock +25 -6
- data/README.md +61 -93
- data/docs/README.md +33 -0
- data/docs/about.md +219 -0
- data/docs/builder/add.md +52 -0
- data/docs/builder/black_white_list.md +83 -0
- data/docs/builder/copy.md +48 -0
- data/docs/builder/explode.md +70 -0
- data/docs/builder/main.md +43 -0
- data/docs/builder/remove.md +46 -0
- data/docs/examples/examples.md +164 -0
- data/docs/export/main.md +37 -0
- data/docs/extract/main.md +89 -0
- data/docs/metadata/main.md +29 -0
- data/docs/query/main.md +45 -0
- data/docs/sources/main.md +36 -0
- data/docs/transform/main.md +53 -0
- data/docs/validate/main.md +42 -0
- data/exe/cure +12 -41
- data/exe/cure.old +59 -0
- data/lib/cure/builder/base_builder.rb +151 -0
- data/lib/cure/builder/candidate.rb +56 -0
- data/lib/cure/cli/command.rb +105 -0
- data/lib/cure/cli/generate_command.rb +54 -0
- data/lib/cure/cli/new_command.rb +52 -0
- data/lib/cure/cli/run_command.rb +19 -0
- data/lib/cure/cli/templates/README.md.erb +1 -0
- data/lib/cure/cli/templates/gemfile.erb +5 -0
- data/lib/cure/cli/templates/gitignore.erb +181 -0
- data/lib/cure/cli/templates/new_template.rb.erb +31 -0
- data/lib/cure/cli/templates/tool-versions.erb +1 -0
- data/lib/cure/config.rb +142 -18
- data/lib/cure/coordinator.rb +61 -25
- data/lib/cure/database.rb +191 -0
- data/lib/cure/dsl/builder.rb +26 -0
- data/lib/cure/dsl/exporters.rb +45 -0
- data/lib/cure/dsl/extraction.rb +60 -0
- data/lib/cure/dsl/metadata.rb +33 -0
- data/lib/cure/dsl/queries.rb +36 -0
- data/lib/cure/dsl/source_files.rb +36 -0
- data/lib/cure/dsl/template.rb +131 -0
- data/lib/cure/dsl/transformations.rb +95 -0
- data/lib/cure/dsl/validator.rb +22 -0
- data/lib/cure/export/base_processor.rb +194 -0
- data/lib/cure/export/manager.rb +24 -0
- data/lib/cure/extract/base_processor.rb +47 -0
- data/lib/cure/extract/csv_lookup.rb +14 -3
- data/lib/cure/extract/extractor.rb +41 -84
- data/lib/cure/extract/filter.rb +118 -0
- data/lib/cure/extract/named_range.rb +94 -0
- data/lib/cure/extract/named_range_processor.rb +128 -0
- data/lib/cure/extract/variable.rb +25 -0
- data/lib/cure/extract/variable_processor.rb +57 -0
- data/lib/cure/generator/base_generator.rb +14 -4
- data/lib/cure/generator/case_generator.rb +10 -3
- data/lib/cure/generator/character_generator.rb +9 -3
- data/lib/cure/generator/erb_generator.rb +21 -0
- data/lib/cure/generator/eval_generator.rb +34 -0
- data/lib/cure/generator/faker_generator.rb +7 -1
- data/lib/cure/generator/guid_generator.rb +7 -2
- data/lib/cure/generator/hex_generator.rb +6 -1
- data/lib/cure/generator/imports.rb +4 -0
- data/lib/cure/generator/number_generator.rb +6 -1
- data/lib/cure/generator/placeholder_generator.rb +7 -1
- data/lib/cure/generator/proc_generator.rb +21 -0
- data/lib/cure/generator/redact_generator.rb +9 -3
- data/lib/cure/generator/static_generator.rb +21 -0
- data/lib/cure/generator/variable_generator.rb +11 -5
- data/lib/cure/helpers/file_helpers.rb +12 -2
- data/lib/cure/helpers/object_helpers.rb +5 -17
- data/lib/cure/helpers/perf_helpers.rb +30 -0
- data/lib/cure/helpers/string.rb +54 -0
- data/lib/cure/launcher.rb +125 -0
- data/lib/cure/log.rb +7 -0
- data/lib/cure/planner.rb +136 -0
- data/lib/cure/strategy/append_strategy.rb +4 -0
- data/lib/cure/strategy/base_strategy.rb +19 -44
- data/lib/cure/strategy/contain_strategy.rb +51 -0
- data/lib/cure/strategy/end_with_strategy.rb +7 -1
- data/lib/cure/strategy/full_strategy.rb +4 -0
- data/lib/cure/strategy/history/history_cache.rb +82 -0
- data/lib/cure/strategy/imports.rb +2 -0
- data/lib/cure/strategy/match_strategy.rb +7 -2
- data/lib/cure/strategy/prepend_strategy.rb +28 -0
- data/lib/cure/strategy/regex_strategy.rb +7 -1
- data/lib/cure/strategy/split_strategy.rb +8 -3
- data/lib/cure/strategy/start_with_strategy.rb +7 -1
- data/lib/cure/transformation/candidate.rb +32 -35
- data/lib/cure/transformation/transform.rb +22 -56
- data/lib/cure/validator/base_rule.rb +78 -0
- data/lib/cure/validator/candidate.rb +54 -0
- data/lib/cure/validator/manager.rb +21 -0
- data/lib/cure/validators.rb +3 -3
- data/lib/cure/version.rb +1 -1
- data/lib/cure.rb +19 -11
- data/templates/dsl_example.rb +48 -0
- data/templates/empty_template.rb +31 -0
- metadata +132 -21
- data/lib/cure/export/exporter.rb +0 -74
- data/lib/cure/extract/builder.rb +0 -27
- data/lib/cure/main.rb +0 -72
- data/lib/cure/template/dispatch.rb +0 -30
- data/lib/cure/template/extraction.rb +0 -38
- data/lib/cure/template/template.rb +0 -28
- data/lib/cure/template/transformations.rb +0 -26
- data/templates/aws_cur_template.json +0 -145
- data/templates/example_template.json +0 -54
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 78d9b4b8ba9b29e7e299811d971a41f3600cbf7df4f1e17729b083f9573f1b49
|
4
|
+
data.tar.gz: f9fc2bfbc4ce3dfbb7769bc4af7cd5af2ed10c72e0d236887787c4a63a6a2d34
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: eccf1c880d1c04653364621f3d2136c9bb049e8b8be07e31ed2f763fedb7ceb4ae4b92ecb1a86baf8314989e0bfc52d51846c5461a782bdb757c0cc13f337992
|
7
|
+
data.tar.gz: d84dcc4031ff49cac80877478df86f9a276743f1815ac5386a46bd408788c6d4ef29e1edfab2648ca0d15c89925282cd5b5f076e2813cf7ad863628fe418c0ef
|
data/.rubocop.yml
CHANGED
@@ -1,5 +1,7 @@
|
|
1
1
|
AllCops:
|
2
|
-
TargetRubyVersion: 2
|
2
|
+
TargetRubyVersion: 3.2
|
3
|
+
Exclude:
|
4
|
+
- 'spec/**/*'
|
3
5
|
|
4
6
|
Style/StringLiterals:
|
5
7
|
Enabled: true
|
@@ -12,10 +14,13 @@ Style/StringLiteralsInInterpolation:
|
|
12
14
|
Layout/LineLength:
|
13
15
|
Max: 120
|
14
16
|
|
17
|
+
Metrics/BlockLength:
|
18
|
+
Max: 40
|
19
|
+
|
15
20
|
# Too short methods lead to extraction of single-use methods, which can make
|
16
21
|
# the code easier to read (by naming things), but can also clutter the class
|
17
22
|
Metrics/MethodLength:
|
18
|
-
Max:
|
23
|
+
Max: 40
|
19
24
|
|
20
25
|
# The guiding principle of classes is SRP, SRP can't be accurately measured by LoC
|
21
26
|
Metrics/ClassLength:
|
@@ -132,4 +137,9 @@ Lint/Debugger:
|
|
132
137
|
|
133
138
|
# Style preference
|
134
139
|
Style/MethodDefParentheses:
|
135
|
-
Enabled: false
|
140
|
+
Enabled: false
|
141
|
+
|
142
|
+
Layout/FirstHashElementIndentation:
|
143
|
+
Enabled: true
|
144
|
+
EnforcedStyle: consistent
|
145
|
+
IndentationWidth: ~
|
data/.tool-versions
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
ruby 3.2.1
|
data/Dockerfile
CHANGED
data/Gemfile
CHANGED
data/Gemfile.lock
CHANGED
@@ -1,20 +1,26 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
cure (0.
|
5
|
-
|
6
|
-
|
4
|
+
cure (0.4.0)
|
5
|
+
artii (~> 2.1.2)
|
6
|
+
faker (~> 3.2.2)
|
7
|
+
rcsv (~> 0.3.1)
|
8
|
+
sequel (~> 5.74.0)
|
9
|
+
sqlite3 (~> 1.6.8)
|
10
|
+
terminal-table (~> 3.0.2)
|
7
11
|
|
8
12
|
GEM
|
9
13
|
remote: https://rubygems.org/
|
10
14
|
specs:
|
15
|
+
artii (2.1.2)
|
11
16
|
ast (2.4.2)
|
12
|
-
|
17
|
+
bigdecimal (3.1.5)
|
18
|
+
concurrent-ruby (1.2.2)
|
13
19
|
diff-lcs (1.4.4)
|
14
20
|
docile (1.4.0)
|
15
|
-
faker (2.
|
21
|
+
faker (3.2.2)
|
16
22
|
i18n (>= 1.8.11, < 2)
|
17
|
-
i18n (1.
|
23
|
+
i18n (1.14.1)
|
18
24
|
concurrent-ruby (~> 1.0)
|
19
25
|
parallel (1.21.0)
|
20
26
|
parser (3.0.2.0)
|
@@ -48,16 +54,28 @@ GEM
|
|
48
54
|
unicode-display_width (>= 1.4.0, < 3.0)
|
49
55
|
rubocop-ast (1.13.0)
|
50
56
|
parser (>= 3.0.1.1)
|
57
|
+
rubocop-performance (1.11.5)
|
58
|
+
rubocop (>= 1.7.0, < 2.0)
|
59
|
+
rubocop-ast (>= 0.4.0)
|
51
60
|
ruby-progressbar (1.11.0)
|
61
|
+
sequel (5.74.0)
|
62
|
+
bigdecimal
|
52
63
|
simplecov (0.21.2)
|
53
64
|
docile (~> 1.1)
|
54
65
|
simplecov-html (~> 0.11)
|
55
66
|
simplecov_json_formatter (~> 0.1)
|
56
67
|
simplecov-html (0.12.3)
|
57
68
|
simplecov_json_formatter (0.1.4)
|
69
|
+
sqlite3 (1.6.9-x86_64-linux)
|
70
|
+
standard (1.4.0)
|
71
|
+
rubocop (= 1.22.3)
|
72
|
+
rubocop-performance (= 1.11.5)
|
73
|
+
terminal-table (3.0.2)
|
74
|
+
unicode-display_width (>= 1.1.1, < 3)
|
58
75
|
unicode-display_width (2.1.0)
|
59
76
|
|
60
77
|
PLATFORMS
|
78
|
+
ruby
|
61
79
|
x86_64-linux
|
62
80
|
|
63
81
|
DEPENDENCIES
|
@@ -66,6 +84,7 @@ DEPENDENCIES
|
|
66
84
|
rspec (~> 3.0)
|
67
85
|
rubocop (~> 1.21)
|
68
86
|
simplecov
|
87
|
+
standard
|
69
88
|
|
70
89
|
BUNDLED WITH
|
71
90
|
2.3.13
|
data/README.md
CHANGED
@@ -3,111 +3,79 @@
|
|
3
3
|

|
4
4
|
[](https://badge.fury.io/rb/cure)
|
5
5
|
|
6
|
-
Cure
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
It has several key features:
|
11
|
-
- Operate on your data to build what you need.
|
12
|
-
- Files are taken through an `Extract -> Build -> Transform -> Export` pipeline.
|
13
|
-
- Extract parts of your file into named ranges to remove junk.
|
14
|
-
- Build (Add/Remove/Explode) columns - handy for files that may have JSON as a column value.
|
15
|
-
- Transform values:
|
16
|
-
- Define either full or regex match groups replacements.
|
17
|
-
- Choose from many strategies to replace anonymous data - random number sequences, GUIDs, placeholders, multipliers amongst many others.
|
18
|
-
- **Existing generated values are stored and recalled** so once a replacement is defined, it is kept around for other columns to use.
|
19
|
-
- For example, once a replacement **Account Number** is generated, any further use of that number sequence is other columns will be used, keeping data real(ish) and functional in a relational sense.
|
20
|
-
- Export into one (or many) files, in a selection of chosen formats (CSV at the moment, coming soon with JSON, Parquet).
|
21
|
-
|
22
|
-
## Use Cases
|
23
|
-
|
24
|
-
- Strip out personal data from a CSV that may be used for public demo.
|
25
|
-
- Extract specific parts of a CSV file and junk the rest.
|
26
|
-
- Explode JSON values into individual columns per key.
|
6
|
+
Cure provides a low-code solution for handling a wide range of tasks for importing, validating and manipulating one or
|
7
|
+
more CSV files. Unlike other tools, Cure doesn't assume standard CSV formatting and is designed to handle a wide range of
|
8
|
+
challenging scenarios.
|
27
9
|
|
28
|
-
|
10
|
+
The library provides optional hooks for each data processing pipeline phase in:
|
29
11
|
|
30
|
-
|
31
|
-
A candidate column entry provides the translations to be run on each column.
|
32
|
-
|
33
|
-
Please see example below.
|
34
|
-
```json
|
35
|
-
{
|
36
|
-
"column" : "identity/LineItemId",
|
37
|
-
"translations" : [{
|
38
|
-
"strategy" : {
|
39
|
-
"name": "full",
|
40
|
-
"options" : {}
|
41
|
-
},
|
42
|
-
"generator" : {
|
43
|
-
"name" : "character",
|
44
|
-
"options" : {
|
45
|
-
"length" : 52,
|
46
|
-
"types" : [
|
47
|
-
"lowercase", "number", "uppercase"
|
48
|
-
]
|
49
|
-
}
|
50
|
-
}
|
51
|
-
}]
|
52
|
-
}
|
53
|
-
```
|
12
|
+
`Sources -> Extract -> Validate -> Build -> Query -> Transform -> Export`
|
54
13
|
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
## Example
|
73
|
-
|
74
|
-
```json
|
75
|
-
{
|
76
|
-
"column" : "identity/ResourceId",
|
77
|
-
"translations" : [{
|
78
|
-
"strategy" : {
|
79
|
-
"name": "full",
|
80
|
-
"options" : {}
|
81
|
-
},
|
82
|
-
"generator" : {
|
83
|
-
"name" : "character",
|
84
|
-
"options" : {
|
85
|
-
"length" : 10,
|
86
|
-
"types" : [
|
87
|
-
"lowercase", "number"
|
88
|
-
]
|
89
|
-
}
|
90
|
-
}
|
91
|
-
}]
|
92
|
-
}
|
93
|
-
```
|
14
|
+
See below for a simple example that loads customer data from a single CSV, redacts the email records, and stores the
|
15
|
+
result in a new CSV.
|
16
|
+
|
17
|
+
```ruby
|
18
|
+
require "cure"
|
19
|
+
|
20
|
+
handler = Cure.init do
|
21
|
+
sources { csv :pathname, Pathname.new("customer_data.csv") }
|
22
|
+
|
23
|
+
extract { named_range at: "D2:G8" }
|
24
|
+
|
25
|
+
transform do
|
26
|
+
candidate column: "email" do
|
27
|
+
with_translation { replace("split", token: "@", index: 0).with("redact") }
|
28
|
+
with_translation { replace("split", token: "@", index: -1).with("redact") }
|
29
|
+
end
|
30
|
+
end
|
94
31
|
|
95
|
-
|
96
|
-
|
97
|
-
i-ae44e104ef1 => ddsf78ds56
|
32
|
+
export { csv file_name: "cust_transformed", directory: "/tmp/cure" }
|
33
|
+
end
|
98
34
|
|
99
|
-
|
100
|
-
# made up of lowercase letters and numbers
|
35
|
+
handler.run_export
|
101
36
|
|
102
|
-
|
37
|
+
# Input (customer_data.csv): Output (cust_transformed.csv):
|
38
|
+
#
|
39
|
+
# | id | email | | id | email |
|
40
|
+
# |----|------------------------| => |----|------------------------|
|
41
|
+
# | 1 | john.smith@gmail.com | | 1 | xxxxxxxxxx@xxxxx.com |
|
42
|
+
# | 2 | lean.davis@outlook.com | | 2 | xxxxxxxxxx@xxxxxxx.com |
|
43
|
+
|
44
|
+
```
|
45
|
+
|
46
|
+
Click this link to view the [documentation](docs/README.md), see a real world [example](http://www.williamthom.as/csv/ruby/2023/04/06/transforming-csvs-with-cure.html),
|
47
|
+
or see a longer list of [features](docs/about.md).
|
103
48
|
|
104
49
|
## Installation
|
105
50
|
|
51
|
+
### Requirements
|
52
|
+
|
53
|
+
- Ruby 3.0 or above
|
54
|
+
- SQLite3
|
55
|
+
|
106
56
|
Install it yourself as:
|
107
57
|
|
108
58
|
$ gem install cure
|
109
59
|
|
110
|
-
|
60
|
+
## Usage
|
61
|
+
|
62
|
+
### CLI
|
63
|
+
|
64
|
+
You can start a new Cure project from the CLI using the following command:
|
65
|
+
|
66
|
+
$ cure new [name]
|
67
|
+
|
68
|
+
This will create a directory to house templates, input and output directories amongst others.
|
69
|
+
|
70
|
+
To perform a one-off run, you can do it manually via the CLI using the following command:
|
71
|
+
|
72
|
+
$ cure run -t template.rb -s source_file.csv
|
73
|
+
|
74
|
+
You can view help with the following command:
|
75
|
+
|
76
|
+
$ cure help
|
77
|
+
|
78
|
+
### Try it out
|
111
79
|
|
112
80
|
To quickly spin up a development environment, please use the Dockerfile provided. Run:
|
113
81
|
|
@@ -118,7 +86,7 @@ Please do not forget to mount any volumes which may have templates that you wish
|
|
118
86
|
|
119
87
|
Once set up and connected to your container, run:
|
120
88
|
|
121
|
-
$ cure -t
|
89
|
+
$ cure run -t template.rb -s source_file.csv
|
122
90
|
|
123
91
|
## Development
|
124
92
|
|
data/docs/README.md
ADDED
@@ -0,0 +1,33 @@
|
|
1
|
+
# Cure
|
2
|
+
|
3
|
+
### Cure Documentation
|
4
|
+
|
5
|
+
- [Metadata](metadata/main.md)
|
6
|
+
- [Sources](sources/main.md)
|
7
|
+
- [Extract](extract/main.md)
|
8
|
+
- [Validate](validate/main.md)
|
9
|
+
- [Build](builder/main.md)
|
10
|
+
- [Query](query/main.md)
|
11
|
+
- [Transform](transform/main.md)
|
12
|
+
- [Export](export/main.md)
|
13
|
+
|
14
|
+
Cure has several key features:
|
15
|
+
- A clean DSL to describe the operations that you want to do. This can be defined in code, or loaded from a file that
|
16
|
+
could be version controlled.
|
17
|
+
- Operate on your data to build what you need.
|
18
|
+
- Files are taken through a `Sources -> Extract -> Validate -> Build -> Query -> Transform -> Export` pipeline.
|
19
|
+
- Each of these steps is optional.
|
20
|
+
- [Metadata](metadata/main.md) allows you to add some comments to your template. Will not impact functionality.
|
21
|
+
- [Sources](sources/main.md) are where you define the file(s) that you wish to operate on.
|
22
|
+
- [Extract](extract/main.md) parts of your file into named ranges to remove junk.
|
23
|
+
- [Validate](validate/main.md) that data fits your expectations.
|
24
|
+
- [Build](builder/main.md) (add, remove, rename, copy, explode) columns.
|
25
|
+
- [Query](query/main.md) your extracted data using SQLite to further control your desired data.
|
26
|
+
- [Transform](transform/main.md) values:
|
27
|
+
- Define either full, split, partials or regex match groups replacements.
|
28
|
+
- Choose from many strategies to replace data - random number sequences, GUIDs, placeholders, multipliers amongst many others.
|
29
|
+
- **Existing generated values are stored and recalled** so once a replacement is defined, it is kept around for other columns to use.
|
30
|
+
- For example, once a replacement **Account Number** is generated, any further occurrence of that number sequence in other columns will reuse the same replacement, keeping data real(ish) and functional in a relational sense.
|
31
|
+
- [Export](export/main.md) into one (or many) files, in a selection of chosen formats, CSV (single or chunked files), or create a custom proc to do whatever you want.
|
32
|
+
|
33
|
+
Please see the [Examples](examples/examples.md) article in the examples directory for more information.
|
data/docs/about.md
ADDED
@@ -0,0 +1,219 @@
|
|
1
|
+
# Cure
|
2
|
+
|
3
|
+
Cure is a versatile tool designed to handle a wide range of tasks for importing or manipulating CSV data.
|
4
|
+
It may take time to get familiar with all the features, but once you do, it is capable of performing a wide
|
5
|
+
range of tasks.
|
6
|
+
|
7
|
+
Cure can be used as an end-to-end CSV importing tool, or for when you just want to validate, extract, merge,
|
8
|
+
clean, transform, remove, anonymize, replace, or manipulate tabular data. It operates in memory by default and
|
9
|
+
can be integrated into existing workflows or controlled via the CLI.
|
10
|
+
|
11
|
+
## Use Cases
|
12
|
+
|
13
|
+
Other CSV utils or importers often make assumptions that CSV data is nicely formatted tabular data. However, in the
|
14
|
+
real world you may get files that don't follow a standard [header,row 1,row 2,row n] format. With Cure, you can load
|
15
|
+
specific parts of a file, or join multiple files together and treat them as one. See this
|
16
|
+
[blog post](http://www.williamthom.as/csv/ruby/2023/04/06/transforming-csvs-with-cure.html) for a detailed example.
|
17
|
+
|
18
|
+
Cure can be used for simple tasks like:
|
19
|
+
- Import data from a spreadsheet into a database.
|
20
|
+
- Split one CSV file into multiple files based on a filter (ex. M/F data in a single file into one M file and one F file).
|
21
|
+
- Change one 10,000 line CSV file into 10 1,000 line files.
|
22
|
+
- Extract specific parts of a CSV and discard the remaining data.
|
23
|
+
- Validate a CSV has the expected data against a spec.
|
24
|
+
- Fix data mistakes.
|
25
|
+
|
26
|
+
... and more complex ones like:
|
27
|
+
- Anonymize and transform personal data in a CSV to prepare it for public demo environments.
|
28
|
+
- Perform complex transformations on values according to specific rules.
|
29
|
+
- Unpack JSON values into individual columns per key.
|
30
|
+
- Process large files sequentially while retaining variable history.
|
31
|
+
- Merge two or more CSV files (or parts thereof) together.
|
32
|
+
|
33
|
+
### In Code
|
34
|
+
Cure can be used as part of your existing application. It is configured using a DSL that can either be inline,
|
35
|
+
or as a file. Check out the [docs](README.md) for more information.
|
36
|
+
|
37
|
+
## When not to use
|
38
|
+
|
39
|
+
Cure processes CSV files as a whole. Some of its features require a complete parse of the file to extract the necessary
|
40
|
+
data before transforming it.
|
41
|
+
|
42
|
+
These features include:
|
43
|
+
|
44
|
+
- Variable extraction (for example, extracting a value from A1 and adding it to each row).
|
45
|
+
- Non-zero indexed headers (for example, taking values from rows 4 to 10 and using row 2 as the source header row).
|
46
|
+
- Expanding JSON fields into columns (for example, if row 1 has values [{"a":1, "b":2}], and row 2 has [{"c":3}], each
|
47
|
+
row needs columns A, B, C, but row 1 doesn't know that until row 2).
|
48
|
+
|
49
|
+
If you have large datasets of streamable CSV data, there are more efficient and performant tools available. However,
|
50
|
+
Cure makes it possible to perform more aggressive transformations, which may require more memory usage. If you still
|
51
|
+
want to use Cure to process large files, you can choose to persist the datastore to disk instead of in memory, which
|
52
|
+
may have a slight impact on performance.
|
53
|
+
|
54
|
+
## How it works
|
55
|
+
|
56
|
+
The library provides designated hooks for each distinct phase in the data processing pipeline:
|
57
|
+
|
58
|
+
`Sources -> Extract -> Validate -> Build -> Query -> Transform -> Export`
|
59
|
+
|
60
|
+
You can choose to opt in to as many or as few stages as needed, no steps are mandatory.
|
61
|
+
|
62
|
+
Cure operates by extracting complete CSV files or specific portions of them into user defined named ranges (one or more
|
63
|
+
cells of tabular data), which are subsequently inserted into SQLite tables. This allows for the ability to join or
|
64
|
+
manipulate rows with SQL, *if you need it*. With data segmented into separate named ranges, multiple transforms and
|
65
|
+
exports can be performed in a single pass.
|
66
|
+
|
67
|
+
## Examples
|
68
|
+
|
69
|
+
### Chunk CSV files
|
70
|
+
|
71
|
+
This is a simple example that takes a sheet and exports it into multiple sheets of 10,000 rows max.
|
72
|
+
|
73
|
+
```ruby
|
74
|
+
require "cure"
|
75
|
+
|
76
|
+
handler = Cure.init do
|
77
|
+
export do
|
78
|
+
chunk_csv file_name_prefix: "my_sheet", directory: "/tmp/cure", chunk_size: 10_000
|
79
|
+
end
|
80
|
+
end
|
81
|
+
|
82
|
+
handler.process(:path, "path/to/my_sheet.csv")
|
83
|
+
```
|
84
|
+
|
85
|
+
### Filter single file into multiple
|
86
|
+
|
87
|
+
This example takes in a sheet of male and female data and exports it into two files based on gender.
|
88
|
+
|
89
|
+
```ruby
|
90
|
+
require "cure"
|
91
|
+
|
92
|
+
handler = Cure.init do
|
93
|
+
extract do
|
94
|
+
named_range name: "male", at: -1
|
95
|
+
named_range name: "female", at: -1
|
96
|
+
end
|
97
|
+
|
98
|
+
query do
|
99
|
+
with named_range: "female", query: <<-SQL
|
100
|
+
SELECT
|
101
|
+
*
|
102
|
+
FROM female
|
103
|
+
WHERE
|
104
|
+
Sex = 'F' AND strftime('%Y', Date) > '2014'
|
105
|
+
ORDER BY Date DESC
|
106
|
+
SQL
|
107
|
+
|
108
|
+
with named_range: "male", query: <<-SQL
|
109
|
+
SELECT
|
110
|
+
*
|
111
|
+
FROM male
|
112
|
+
WHERE
|
113
|
+
Sex = 'M' AND strftime('%Y', Date) > '2014'
|
114
|
+
ORDER BY Date DESC
|
115
|
+
SQL
|
116
|
+
end
|
117
|
+
|
118
|
+
export do
|
119
|
+
csv file_name: "male", directory: "/tmp/cure", named_range: "male"
|
120
|
+
csv file_name: "female", directory: "/tmp/cure", named_range: "female"
|
121
|
+
end
|
122
|
+
end
|
123
|
+
|
124
|
+
handler.process(:path, "path/to/my_sheet.csv")
|
125
|
+
```
|
126
|
+
|
127
|
+
### Validate data
|
128
|
+
|
129
|
+
This example validates that a sheet has valid columns. It will throw an error if it isn't valid.
|
130
|
+
|
131
|
+
```ruby
|
132
|
+
require "cure"
|
133
|
+
|
134
|
+
handler = Cure.init do
|
135
|
+
validate do
|
136
|
+
candidate column: "rating", options: { fail_on_error: true } do
|
137
|
+
with_rule :not_null
|
138
|
+
with_rule :length, { min: 0, max: 5 }
|
139
|
+
end
|
140
|
+
|
141
|
+
candidate column: "phone_number", options: { fail_on_error: true } do
|
142
|
+
with_rule :custom, { proc: proc { |val| val =~ /^04\d{8}$/ } }
|
143
|
+
end
|
144
|
+
end
|
145
|
+
end
|
146
|
+
|
147
|
+
handler.process(:path, "path/to/my_sheet.csv")
|
148
|
+
```
|
149
|
+
|
150
|
+
### Transform data
|
151
|
+
|
152
|
+
This example anonymizes private data found in a cloud invoice. Note that when the existing account number is found
|
153
|
+
in any column, it is replaced with the same value, maintaining referential integrity whilst being anonymized.
|
154
|
+
|
155
|
+
You can see the [before](../spec/cure/e2e/input/aws_billing_input.csv) and [after](../spec/cure/e2e/output/aws_billing_output.csv) CSVs
|
156
|
+
made from this template by clicking on the links.
|
157
|
+
|
158
|
+
```ruby
|
159
|
+
require "cure"
|
160
|
+
|
161
|
+
handler = Cure.init do
|
162
|
+
build do
|
163
|
+
candidate do
|
164
|
+
whitelist options: {
|
165
|
+
columns: %w[
|
166
|
+
bill/BillingEntity
|
167
|
+
bill/PayerAccountId
|
168
|
+
bill/BillingPeriodStartDate
|
169
|
+
bill/BillingPeriodEndDate
|
170
|
+
lineItem/UsageAccountId
|
171
|
+
lineItem/LineItemType
|
172
|
+
lineItem/UsageStartDate
|
173
|
+
lineItem/UsageEndDate
|
174
|
+
lineItem/UsageType
|
175
|
+
lineItem/ResourceId
|
176
|
+
lineItem/ProductCode
|
177
|
+
lineItem/UsageAmount
|
178
|
+
lineItem/CurrencyCode
|
179
|
+
]
|
180
|
+
}
|
181
|
+
end
|
182
|
+
end
|
183
|
+
|
184
|
+
rot13_proc = proc { |source, _ctx|
|
185
|
+
source.gsub(/[^a-zA-Z0-9]/, '').tr('A-Za-z', 'N-ZA-Mn-za-m')
|
186
|
+
}
|
187
|
+
|
188
|
+
transform do
|
189
|
+
candidate column: "bill/PayerAccountId" do
|
190
|
+
with_translation { replace("full").with("placeholder", name: :account_number) }
|
191
|
+
end
|
192
|
+
|
193
|
+
candidate column: "lineItem/UsageAccountId" do
|
194
|
+
with_translation { replace("full").with("number", length: 10) }
|
195
|
+
end
|
196
|
+
|
197
|
+
candidate column: "lineItem/ResourceId", options: {ignore_empty: true} do
|
198
|
+
# If there is a match (i-[my-group]), replace just the match group using rot13_proc (strips non-alphanumerics, then ROT13s the letters)
|
199
|
+
with_translation { replace("regex", regex_cg: "^i-(.*)").with("proc", execute: rot13_proc) }
|
200
|
+
|
201
|
+
# If the string contains the account number, replace with the account_number placeholder.
|
202
|
+
with_translation { replace("contain", match: "1234567890").with("placeholder", name: :account_number) }
|
203
|
+
|
204
|
+
# If no match is found, replace the whole value using rot13_proc
|
205
|
+
if_no_match { replace("full").with("proc", execute: rot13_proc) }
|
206
|
+
end
|
207
|
+
|
208
|
+
# Hardcoded values that we may wish to reference
|
209
|
+
place_holders({account_number: 987_654_321})
|
210
|
+
end
|
211
|
+
|
212
|
+
export do
|
213
|
+
terminal title: "Preview", limit_rows: 20
|
214
|
+
csv file_name: "aws", directory: "/tmp/cure"
|
215
|
+
end
|
216
|
+
end
|
217
|
+
|
218
|
+
handler.process(:path, "path/to/my_sheet.csv")
|
219
|
+
```
|
data/docs/builder/add.md
ADDED
@@ -0,0 +1,52 @@
|
|
1
|
+
[... go back to build contents](main.md)
|
2
|
+
|
3
|
+
## Add
|
4
|
+
|
5
|
+
### What is it?
|
6
|
+
|
7
|
+
Add builder will add a new, empty column to the spreadsheet.
|
8
|
+
|
9
|
+
### Why would you need it?
|
10
|
+
|
11
|
+
As useless as a new empty column sounds, it can be used for a placeholder column to be used later. A common example
|
12
|
+
of this may be if you want to add a variable to each row. For example, at the top of a spreadsheet, you may have a
|
13
|
+
date, but you want to add that to each row.
|
14
|
+
|
15
|
+
### Full Configuration
|
16
|
+
|
17
|
+
```ruby
|
18
|
+
build do
|
19
|
+
candidate(column: "new_column", named_range: "mysheet") { add options: { default_value: "-" } }
|
20
|
+
end
|
21
|
+
```
|
22
|
+
- `column`: represents the column name, mandatory.
|
23
|
+
- `named_range`: specifies the named range holding the column, if no named range has been set you can leave it blank.
|
24
|
+
- `options`:
|
25
|
+
- `default_value`: not mandatory; if provided, it is used as the initial value of the new column in each row.
|
26
|
+
|
27
|
+
### Example
|
28
|
+
|
29
|
+
```ruby
|
30
|
+
build do
|
31
|
+
candidate(column: "col_b") { add }
|
32
|
+
end
|
33
|
+
```
|
34
|
+
|
35
|
+
Original input:
|
36
|
+
```
|
37
|
+
+-------+
|
38
|
+
| col_a |
|
39
|
+
+-------+
|
40
|
+
| a |
|
41
|
+
+-------+
|
42
|
+
```
|
43
|
+
|
44
|
+
changes to:
|
45
|
+
|
46
|
+
```
|
47
|
+
+-------+-------+
|
48
|
+
| col_a | col_b |
|
49
|
+
+-------+-------+
|
50
|
+
| a | |
|
51
|
+
+-------+-------+
|
52
|
+
```
|