cure 0.1.2 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (112) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +13 -3
  3. data/.tool-versions +1 -0
  4. data/Dockerfile +1 -1
  5. data/Gemfile +1 -0
  6. data/Gemfile.lock +25 -6
  7. data/README.md +61 -93
  8. data/docs/README.md +33 -0
  9. data/docs/about.md +219 -0
  10. data/docs/builder/add.md +52 -0
  11. data/docs/builder/black_white_list.md +83 -0
  12. data/docs/builder/copy.md +48 -0
  13. data/docs/builder/explode.md +70 -0
  14. data/docs/builder/main.md +43 -0
  15. data/docs/builder/remove.md +46 -0
  16. data/docs/examples/examples.md +164 -0
  17. data/docs/export/main.md +37 -0
  18. data/docs/extract/main.md +89 -0
  19. data/docs/metadata/main.md +29 -0
  20. data/docs/query/main.md +45 -0
  21. data/docs/sources/main.md +36 -0
  22. data/docs/transform/main.md +53 -0
  23. data/docs/validate/main.md +42 -0
  24. data/exe/cure +12 -41
  25. data/exe/cure.old +59 -0
  26. data/lib/cure/builder/base_builder.rb +151 -0
  27. data/lib/cure/builder/candidate.rb +56 -0
  28. data/lib/cure/cli/command.rb +105 -0
  29. data/lib/cure/cli/generate_command.rb +54 -0
  30. data/lib/cure/cli/new_command.rb +52 -0
  31. data/lib/cure/cli/run_command.rb +19 -0
  32. data/lib/cure/cli/templates/README.md.erb +1 -0
  33. data/lib/cure/cli/templates/gemfile.erb +5 -0
  34. data/lib/cure/cli/templates/gitignore.erb +181 -0
  35. data/lib/cure/cli/templates/new_template.rb.erb +31 -0
  36. data/lib/cure/cli/templates/tool-versions.erb +1 -0
  37. data/lib/cure/config.rb +142 -18
  38. data/lib/cure/coordinator.rb +61 -25
  39. data/lib/cure/database.rb +191 -0
  40. data/lib/cure/dsl/builder.rb +26 -0
  41. data/lib/cure/dsl/exporters.rb +45 -0
  42. data/lib/cure/dsl/extraction.rb +60 -0
  43. data/lib/cure/dsl/metadata.rb +33 -0
  44. data/lib/cure/dsl/queries.rb +36 -0
  45. data/lib/cure/dsl/source_files.rb +36 -0
  46. data/lib/cure/dsl/template.rb +131 -0
  47. data/lib/cure/dsl/transformations.rb +95 -0
  48. data/lib/cure/dsl/validator.rb +22 -0
  49. data/lib/cure/export/base_processor.rb +194 -0
  50. data/lib/cure/export/manager.rb +24 -0
  51. data/lib/cure/extract/base_processor.rb +47 -0
  52. data/lib/cure/extract/csv_lookup.rb +14 -3
  53. data/lib/cure/extract/extractor.rb +41 -84
  54. data/lib/cure/extract/filter.rb +118 -0
  55. data/lib/cure/extract/named_range.rb +94 -0
  56. data/lib/cure/extract/named_range_processor.rb +128 -0
  57. data/lib/cure/extract/variable.rb +25 -0
  58. data/lib/cure/extract/variable_processor.rb +57 -0
  59. data/lib/cure/generator/base_generator.rb +14 -4
  60. data/lib/cure/generator/case_generator.rb +10 -3
  61. data/lib/cure/generator/character_generator.rb +9 -3
  62. data/lib/cure/generator/erb_generator.rb +21 -0
  63. data/lib/cure/generator/eval_generator.rb +34 -0
  64. data/lib/cure/generator/faker_generator.rb +7 -1
  65. data/lib/cure/generator/guid_generator.rb +7 -2
  66. data/lib/cure/generator/hex_generator.rb +6 -1
  67. data/lib/cure/generator/imports.rb +4 -0
  68. data/lib/cure/generator/number_generator.rb +6 -1
  69. data/lib/cure/generator/placeholder_generator.rb +7 -1
  70. data/lib/cure/generator/proc_generator.rb +21 -0
  71. data/lib/cure/generator/redact_generator.rb +9 -3
  72. data/lib/cure/generator/static_generator.rb +21 -0
  73. data/lib/cure/generator/variable_generator.rb +11 -5
  74. data/lib/cure/helpers/file_helpers.rb +12 -2
  75. data/lib/cure/helpers/object_helpers.rb +5 -17
  76. data/lib/cure/helpers/perf_helpers.rb +30 -0
  77. data/lib/cure/helpers/string.rb +54 -0
  78. data/lib/cure/launcher.rb +125 -0
  79. data/lib/cure/log.rb +7 -0
  80. data/lib/cure/planner.rb +136 -0
  81. data/lib/cure/strategy/append_strategy.rb +4 -0
  82. data/lib/cure/strategy/base_strategy.rb +19 -44
  83. data/lib/cure/strategy/contain_strategy.rb +51 -0
  84. data/lib/cure/strategy/end_with_strategy.rb +7 -1
  85. data/lib/cure/strategy/full_strategy.rb +4 -0
  86. data/lib/cure/strategy/history/history_cache.rb +82 -0
  87. data/lib/cure/strategy/imports.rb +2 -0
  88. data/lib/cure/strategy/match_strategy.rb +7 -2
  89. data/lib/cure/strategy/prepend_strategy.rb +28 -0
  90. data/lib/cure/strategy/regex_strategy.rb +7 -1
  91. data/lib/cure/strategy/split_strategy.rb +8 -3
  92. data/lib/cure/strategy/start_with_strategy.rb +7 -1
  93. data/lib/cure/transformation/candidate.rb +32 -35
  94. data/lib/cure/transformation/transform.rb +22 -56
  95. data/lib/cure/validator/base_rule.rb +78 -0
  96. data/lib/cure/validator/candidate.rb +54 -0
  97. data/lib/cure/validator/manager.rb +21 -0
  98. data/lib/cure/validators.rb +3 -3
  99. data/lib/cure/version.rb +1 -1
  100. data/lib/cure.rb +19 -11
  101. data/templates/dsl_example.rb +48 -0
  102. data/templates/empty_template.rb +31 -0
  103. metadata +132 -21
  104. data/lib/cure/export/exporter.rb +0 -74
  105. data/lib/cure/extract/builder.rb +0 -27
  106. data/lib/cure/main.rb +0 -72
  107. data/lib/cure/template/dispatch.rb +0 -30
  108. data/lib/cure/template/extraction.rb +0 -38
  109. data/lib/cure/template/template.rb +0 -28
  110. data/lib/cure/template/transformations.rb +0 -26
  111. data/templates/aws_cur_template.json +0 -145
  112. data/templates/example_template.json +0 -54
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 29e7c915346b45c1407208f6ba1810ece68004c8d7dd9ffb8eb2b5dcca903b35
4
- data.tar.gz: c356c649bc4a902fcb347e9e16f2299f4c8a7c0db576a7fc8242d080b575920d
3
+ metadata.gz: 78d9b4b8ba9b29e7e299811d971a41f3600cbf7df4f1e17729b083f9573f1b49
4
+ data.tar.gz: f9fc2bfbc4ce3dfbb7769bc4af7cd5af2ed10c72e0d236887787c4a63a6a2d34
5
5
  SHA512:
6
- metadata.gz: 5999a52c084656af0a6a18ba68884aec3de825be9f84f5c67cdf771031af414789aaebf24dfc9fed52f6a76b9a9495081f0f4e1c0f872927fbe34c271fa86e62
7
- data.tar.gz: 458962e9905380ef0e5832ef9e2ae2da1ede47f448992314bceba95c997c2af3bb4d6f2b92e1f23e906563c77ad1fbcd37d171d75732cf29c15b6b9b0e05e5bc
6
+ metadata.gz: eccf1c880d1c04653364621f3d2136c9bb049e8b8be07e31ed2f763fedb7ceb4ae4b92ecb1a86baf8314989e0bfc52d51846c5461a782bdb757c0cc13f337992
7
+ data.tar.gz: d84dcc4031ff49cac80877478df86f9a276743f1815ac5386a46bd408788c6d4ef29e1edfab2648ca0d15c89925282cd5b5f076e2813cf7ad863628fe418c0ef
data/.rubocop.yml CHANGED
@@ -1,5 +1,7 @@
1
1
  AllCops:
2
- TargetRubyVersion: 2.6
2
+ TargetRubyVersion: 3.2
3
+ Exclude:
4
+ - 'spec/**/*'
3
5
 
4
6
  Style/StringLiterals:
5
7
  Enabled: true
@@ -12,10 +14,13 @@ Style/StringLiteralsInInterpolation:
12
14
  Layout/LineLength:
13
15
  Max: 120
14
16
 
17
+ Metrics/BlockLength:
18
+ Max: 40
19
+
15
20
  # Too short methods lead to extraction of single-use methods, which can make
16
21
  # the code easier to read (by naming things), but can also clutter the class
17
22
  Metrics/MethodLength:
18
- Max: 20
23
+ Max: 40
19
24
 
20
25
  # The guiding principle of classes is SRP, SRP can't be accurately measured by LoC
21
26
  Metrics/ClassLength:
@@ -132,4 +137,9 @@ Lint/Debugger:
132
137
 
133
138
  # Style preference
134
139
  Style/MethodDefParentheses:
135
- Enabled: false
140
+ Enabled: false
141
+
142
+ Layout/FirstHashElementIndentation:
143
+ Enabled: true
144
+ EnforcedStyle: consistent
145
+ IndentationWidth: ~
data/.tool-versions ADDED
@@ -0,0 +1 @@
1
+ ruby 3.2.1
data/Dockerfile CHANGED
@@ -1,4 +1,4 @@
1
- FROM ruby:3.0.2
1
+ FROM ruby:3.1.4
2
2
  RUN apt-get update && apt-get install -y \
3
3
  build-essential
4
4
 
data/Gemfile CHANGED
@@ -10,3 +10,4 @@ gem "rake", "~> 13.0"
10
10
  gem "rspec", "~> 3.0"
11
11
 
12
12
  gem "rubocop", "~> 1.21"
13
+ gem "standard", group: "development", require: false
data/Gemfile.lock CHANGED
@@ -1,20 +1,26 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- cure (0.1.2)
5
- faker
6
- rcsv
4
+ cure (0.4.0)
5
+ artii (~> 2.1.2)
6
+ faker (~> 3.2.2)
7
+ rcsv (~> 0.3.1)
8
+ sequel (~> 5.74.0)
9
+ sqlite3 (~> 1.6.8)
10
+ terminal-table (~> 3.0.2)
7
11
 
8
12
  GEM
9
13
  remote: https://rubygems.org/
10
14
  specs:
15
+ artii (2.1.2)
11
16
  ast (2.4.2)
12
- concurrent-ruby (1.1.10)
17
+ bigdecimal (3.1.5)
18
+ concurrent-ruby (1.2.2)
13
19
  diff-lcs (1.4.4)
14
20
  docile (1.4.0)
15
- faker (2.21.0)
21
+ faker (3.2.2)
16
22
  i18n (>= 1.8.11, < 2)
17
- i18n (1.10.0)
23
+ i18n (1.14.1)
18
24
  concurrent-ruby (~> 1.0)
19
25
  parallel (1.21.0)
20
26
  parser (3.0.2.0)
@@ -48,16 +54,28 @@ GEM
48
54
  unicode-display_width (>= 1.4.0, < 3.0)
49
55
  rubocop-ast (1.13.0)
50
56
  parser (>= 3.0.1.1)
57
+ rubocop-performance (1.11.5)
58
+ rubocop (>= 1.7.0, < 2.0)
59
+ rubocop-ast (>= 0.4.0)
51
60
  ruby-progressbar (1.11.0)
61
+ sequel (5.74.0)
62
+ bigdecimal
52
63
  simplecov (0.21.2)
53
64
  docile (~> 1.1)
54
65
  simplecov-html (~> 0.11)
55
66
  simplecov_json_formatter (~> 0.1)
56
67
  simplecov-html (0.12.3)
57
68
  simplecov_json_formatter (0.1.4)
69
+ sqlite3 (1.6.9-x86_64-linux)
70
+ standard (1.4.0)
71
+ rubocop (= 1.22.3)
72
+ rubocop-performance (= 1.11.5)
73
+ terminal-table (3.0.2)
74
+ unicode-display_width (>= 1.1.1, < 3)
58
75
  unicode-display_width (2.1.0)
59
76
 
60
77
  PLATFORMS
78
+ ruby
61
79
  x86_64-linux
62
80
 
63
81
  DEPENDENCIES
@@ -66,6 +84,7 @@ DEPENDENCIES
66
84
  rspec (~> 3.0)
67
85
  rubocop (~> 1.21)
68
86
  simplecov
87
+ standard
69
88
 
70
89
  BUNDLED WITH
71
90
  2.3.13
data/README.md CHANGED
@@ -3,111 +3,79 @@
3
3
  ![run tests](https://github.com/williamthom-as/cure/actions/workflows/rspec.yml/badge.svg)
4
4
  [![Gem Version](https://badge.fury.io/rb/cure.svg)](https://badge.fury.io/rb/cure)
5
5
 
6
- Cure is a simple tool to **extract/clean/transform/remove/redact/anonymize** and **replace** information in a spreadsheet.
7
- It has been written to anonymize private cloud billing data for use in public demo environments. Since then, it has grown to
8
- additional processing capabilities that can take a CSV from junk to workable data.
9
-
10
- It has several key features:
11
- - Operate on your data to build what you need.
12
- - Files are taken through an `Extract -> Build -> Transform -> Export` pipeline.
13
- - Extract parts of your file into named ranges to remove junk.
14
- - Build (Add/Remove/Explode) columns - handy for files that may have JSON as a column value.
15
- - Transform values:
16
- - Define either full or regex match groups replacements.
17
- - Choose from many strategies to replace anonymous data - random number sequences, GUIDs, placeholders, multipliers amongst many others.
18
- - **Existing generated values are stored and recalled** so once a replacement is defined, it is kept around for other columns to use.
19
- - For example, once a replacement **Account Number** is generated, any further use of that number sequence is other columns will be used, keeping data real(ish) and functional in a relational sense.
20
- - Export into one (or many) files, in a selection of chosen formats (CSV at the moment, coming soon with JSON, Parquet).
21
-
22
- ## Use Cases
23
-
24
- - Strip out personal data from a CSV that may be used for public demo.
25
- - Extract specific parts of a CSV file and junk the rest.
26
- - Explode JSON values into individual columns per key.
6
+ Cure provides a low-code solution for handling a wide range of tasks for importing, validating and manipulating one or
7
+ more CSV files. Unlike other tools, Cure doesn't assume standard CSV formatting and is designed to handle a wide range of
8
+ challenging scenarios.
27
9
 
28
- ## Usage
10
+ The library provides optional hooks for each data processing pipeline phase in:
29
11
 
30
- Cure requires two things, a **template** (or rules) file. This is a descriptive file that provides the translations required on each column.
31
- A candidate column entry provides the translations to be run on each column.
32
-
33
- Please see example below.
34
- ```json
35
- {
36
- "column" : "identity/LineItemId",
37
- "translations" : [{
38
- "strategy" : {
39
- "name": "full",
40
- "options" : {}
41
- },
42
- "generator" : {
43
- "name" : "character",
44
- "options" : {
45
- "length" : 52,
46
- "types" : [
47
- "lowercase", "number", "uppercase"
48
- ]
49
- }
50
- }
51
- }]
52
- }
53
- ```
12
+ `Sources -> Extract -> Validate -> Build -> Query -> Transform -> Export`
54
13
 
55
- A **translation** is made up of a strategy and generator.
56
-
57
- **Strategies** are the means of selecting the *value* to change. You may choose from:
58
- - **Full replacement**: replaces the full entry.
59
- - **Regex replacement**: can replace either the match group (partial), or full record *if* there is a match.
60
- - **Includes replacement**: can replace either the matched substring, or full record *if* there is a match.
61
- - **StartWith replacement**: can replace either the starts with substring, or full record *if* there is a match.
62
- - **EndWith replacement**: can replace either the end with substring, or full record *if* there is a match.
63
-
64
- **Generators** are the way a replacement value is generated. You may choose from:
65
- - Random number generator
66
- - Random Hex numbers
67
- - Random character strings
68
- - Placeholder lookups
69
- - Redaction strings
70
- - Removal (empty string)
71
-
72
- ## Example
73
-
74
- ```json
75
- {
76
- "column" : "identity/ResourceId",
77
- "translations" : [{
78
- "strategy" : {
79
- "name": "full",
80
- "options" : {}
81
- },
82
- "generator" : {
83
- "name" : "character",
84
- "options" : {
85
- "length" : 10,
86
- "types" : [
87
- "lowercase", "number"
88
- ]
89
- }
90
- }
91
- }]
92
- }
93
- ```
14
+ See below for a simple example that loads customer data from a single CSV, redacts the email records, and stores the
15
+ result in a new CSV.
16
+
17
+ ```ruby
18
+ require "cure"
19
+
20
+ handler = Cure.init do
21
+ sources { csv :pathname, Pathname.new("customer_data.csv") }
22
+
23
+ extract { named_range at: "D2:G8" }
24
+
25
+ transform do
26
+ candidate column: "email" do
27
+ with_translation { replace("split", token: "@", index: 0).with("redact") }
28
+ with_translation { replace("split", token: "@", index: -1).with("redact") }
29
+ end
30
+ end
94
31
 
95
- The above example would translate a source value from column **identity/ResourceId** as follows:
96
-
97
- i-ae44e104ef1 => ddsf78ds56
32
+ export { csv file_name: "cust_transformed", directory: "/tmp/cure" }
33
+ end
98
34
 
99
- # A full replacement, with a random generated 10 character string
100
- # made up of lowercase letters and numbers
35
+ handler.run_export
101
36
 
102
- You can see more of these examples in the Examples folder.
37
+ # Input (customer_data.csv): Output (cust_transformed.csv):
38
+ #
39
+ # | id | email | | id | email |
40
+ # |----|------------------------| => |----|------------------------|
41
+ # | 1 | john.smith@gmail.com | | 1 | xxxxxxxxxx@xxxxx.com |
42
+ # | 2 | lean.davis@outlook.com | | 2 | xxxxxxxxxx@xxxxxxx.com |
43
+
44
+ ```
45
+
46
+ Click this link to view the [documentation](docs/README.md), see a real world [example](http://www.williamthom.as/csv/ruby/2023/04/06/transforming-csvs-with-cure.html),
47
+ or see a longer list of [features](docs/about.md).
103
48
 
104
49
  ## Installation
105
50
 
51
+ ### Requirements
52
+
53
+ - Ruby 3.0 or above
54
+ - SQLite3
55
+
106
56
  Install it yourself as:
107
57
 
108
58
  $ gem install cure
109
59
 
110
- ### Getting started *quickly*
60
+ ## Usage
61
+
62
+ ### CLI
63
+
64
+ You can start a new Cure project from the CLI using the following command:
65
+
66
+ $ cure new [name]
67
+
68
+ This will create a directory to house templates, input and output directories amongst others.
69
+
70
+ To perform a one-off run, you can do it manually via the CLI using the following command:
71
+
72
+ $ cure run -t template.rb -s source_file.csv
73
+
74
+ You can view help with the following command:
75
+
76
+ $ cure help
77
+
78
+ ### Try it out
111
79
 
112
80
  To quickly spin up a development environment, please use the Dockerfile provided. Run:
113
81
 
@@ -118,7 +86,7 @@ Please do not forget to mount any volumes which may have templates that you wish
118
86
 
119
87
  Once set up and connected to your container, run:
120
88
 
121
- $ cure -t /file/path/to/template.json -s /file/path/to/source_file.csv -o /my/output/folder
89
+ $ cure run -t template.rb -s source_file.csv
122
90
 
123
91
  ## Development
124
92
 
data/docs/README.md ADDED
@@ -0,0 +1,33 @@
1
+ # Cure
2
+
3
+ ### Cure Documentation
4
+
5
+ - [Metadata](metadata/main.md)
6
+ - [Sources](sources/main.md)
7
+ - [Extract](extract/main.md)
8
+ - [Validate](validate/main.md)
9
+ - [Build](builder/main.md)
10
+ - [Query](query/main.md)
11
+ - [Transform](transform/main.md)
12
+ - [Export](export/main.md)
13
+
14
+ Cure has several key features:
15
+ - A clean DSL to describe the operations that you want to do. This can be defined in code, or loaded from a file that
16
+ could be version controlled.
17
+ - Operate on your data to build what you need.
18
+ - Files are taken through a `Source -> Extract -> Validate -> Build -> Query -> Transform -> Export` pipeline.
19
+ - Each of these steps is optional.
20
+ - [Metadata](metadata/main.md) allows you to add some comments to your template. Will not impact functionality.
21
+ - [Sources](sources/main.md) are where you define the file(s) that you wish to operate on.
22
+ - [Extract](extract/main.md) parts of your file into named ranges to remove junk.
23
+ - [Validate](validate/main.md) that data fits your expectations.
24
+ - [Build](builder/main.md) (add, remove, rename, copy, explode) columns.
25
+ - [Query](query/main.md) your extracted data using SQLite to further control your desired data.
26
+ - [Transform](transform/main.md) values:
27
+ - Define either full, split, partials or regex match groups replacements.
28
+ - Choose from many strategies to replace data - random number sequences, GUIDs, placeholders, multipliers amongst many others.
29
+ - **Existing generated values are stored and recalled** so once a replacement is defined, it is kept around for other columns to use.
30
+ - For example, once a replacement **Account Number** is generated, any further use of that number sequence in other columns will reuse the same replacement, keeping data real(ish) and functional in a relational sense.
31
+ - [Export](export/main.md) into one (or many) files, in a selection of chosen formats, CSV (single or chunked files), or create a custom proc to do whatever you want.
32
+
33
+ Please see the [Examples](examples/examples.md) article in the examples directory for more information.
data/docs/about.md ADDED
@@ -0,0 +1,219 @@
1
+ # Cure
2
+
3
+ Cure is a versatile tool designed to handle a wide range of tasks for importing or manipulating CSV data.
4
+ It may take time to get familiar with all the features, but once you do, it is capable of performing a wide
5
+ range of tasks.
6
+
7
+ Cure can be used as an end-to-end CSV importing tool, or for when you just want to validate, extract, merge,
8
+ clean, transform, remove, anonymize, replace, or manipulate tabular data. It operates in memory by default and
9
+ can be integrated into existing workflows or controlled via the CLI.
10
+
11
+ ## Use Cases
12
+
13
+ Other CSV utils or importers often make assumptions that CSV data is nicely formatted tabular data. However, in the
14
+ real world you may get files that don't follow a standard [header,row 1,row 2,row n] format. With Cure, you can load
15
+ specific parts of a file, or join multiple files together and treat them as one. See this
16
+ [blog post](http://www.williamthom.as/csv/ruby/2023/04/06/transforming-csvs-with-cure.html) for a detailed example.
17
+
18
+ Cure can be used for simple tasks like:
19
+ - Import data from a spreadsheet into a database.
20
+ - Split one CSV file into multiple files based on a filter (ex. M/F data in a single file into one M file and one F file).
21
+ - Change one 10,000 line CSV file into 10 1,000 line files.
22
+ - Extract specific parts of a CSV and discard the remaining data.
23
+ - Validate a CSV has the expected data against a spec.
24
+ - Fix data mistakes.
25
+
26
+ ... and more complex ones like:
27
+ - Anonymize and transform personal data in a CSV to prepare it for public demo environments.
28
+ - Perform complex transformations on values according to specific rules.
29
+ - Unpack JSON values into individual columns per key.
30
+ - Process large files sequentially while retaining variable history.
31
+ - Merge two or more CSV files (or parts thereof) together.
32
+
33
+ ### In Code
34
+ Cure can be used as part of your existing application. It is configured using a DSL that can either be inline,
35
+ or as a file. Check out [docs](docs/README.md) for more information.
36
+
37
+ ## When not to use
38
+
39
+ Cure processes CSV files as a whole. Some of its features require a complete parse of the file to extract the necessary
40
+ data before transforming it.
41
+
42
+ These features include:
43
+
44
+ - Variable extraction (for example, extracting a value from A1 and adding it to each row).
45
+ - Non-zero indexed headers (for example, taking values from rows 4 to 10 and using row 2 as the source header row).
46
+ - Expanding JSON fields into columns (for example, if row 1 has values [{"a":1, "b":2}], and row 2 has [{"c":3}], each
47
+ row needs columns A, B, C, but row 1 doesn't know that until row 2).
48
+
49
+ If you have large datasets of streamable CSV data, there are more efficient and performant tools available. However,
50
+ Cure makes it possible to perform more aggressive transformations, which may require more memory usage. If you still
51
+ want to use Cure to process large files, you can choose to persist the datastore to disk instead of in memory, which
52
+ may have a slight impact on performance.
53
+
54
+ ## How it works
55
+
56
+ The library provides designated hooks for each distinct phase in the data processing pipeline:
57
+
58
+ `Extract -> Validate -> Build -> Query -> Transform -> Export`
59
+
60
+ You can choose to opt in to as many or as few stages as needed; no steps are mandatory.
61
+
62
+ Cure operates by extracting complete CSV files or specific portions of them into user defined named ranges (one or more
63
+ cells of tabular data), which are subsequently inserted into SQLite tables. This allows for the ability to join or
64
+ manipulate rows with SQL, *if you need it*. With data segmented into separate named ranges, multiple transforms and
65
+ exports can be performed in a single pass.
66
+
67
+ ## Examples
68
+
69
+ ### Chunk CSV files
70
+
71
+ This is a simple example that takes a sheet and exports it into multiple sheets of 10,000 rows max.
72
+
73
+ ```ruby
74
+ require "cure"
75
+
76
+ handler = Cure.init do
77
+ export do
78
+ chunk_csv file_name_prefix: "my_sheet", directory: "/tmp/cure", chunk_size: 10_000
79
+ end
80
+ end
81
+
82
+ handler.process(:path, "path/to/my_sheet.csv")
83
+ ```
84
+
85
+ ### Filter single file into multiple
86
+
87
+ This example takes in a sheet of male and female data and exports it into two files based on gender.
88
+
89
+ ```ruby
90
+ require "cure"
91
+
92
+ handler = Cure.init do
93
+ extract do
94
+ named_range name: "male", at: -1
95
+ named_range name: "female", at: -1
96
+ end
97
+
98
+ query do
99
+ with named_range: "female", query: <<-SQL
100
+ SELECT
101
+ *
102
+ FROM female
103
+ WHERE
104
+ Sex = 'F' AND strftime('%Y', Date) > '2014'
105
+ ORDER BY Date DESC
106
+ SQL
107
+
108
+ with named_range: "male", query: <<-SQL
109
+ SELECT
110
+ *
111
+ FROM male
112
+ WHERE
113
+ Sex = 'M' AND strftime('%Y', Date) > '2014'
114
+ ORDER BY Date DESC
115
+ SQL
116
+ end
117
+
118
+ export do
119
+ csv file_name: "male", directory: "/tmp/cure", named_range: "male"
120
+ csv file_name: "female", directory: "/tmp/cure", named_range: "female"
121
+ end
122
+ end
123
+
124
+ handler.process(:path, "path/to/my_sheet.csv")
125
+ ```
126
+
127
+ ### Validate data
128
+
129
+ This example validates that a sheet has valid columns. It will throw an error if it isn't valid.
130
+
131
+ ```ruby
132
+ require "cure"
133
+
134
+ handler = Cure.init do
135
+ validate do
136
+ candidate column: "rating", options: { fail_on_error: true } do
137
+ with_rule :not_null
138
+ with_rule :length, { min: 0, max: 5 }
139
+ end
140
+
141
+ candidate column: "phone_number", options: { fail_on_error: true } do
142
+ with_rule :custom, { proc: proc { |val| val =~ /^04\d{8}$/ } }
143
+ end
144
+ end
145
+ end
146
+
147
+ handler.process(:path, "path/to/my_sheet.csv")
148
+ ```
149
+
150
+ ### Transform data
151
+
152
+ This example anonymizes private data found in a cloud invoice. Note that when the existing account number is found
153
+ in any column, it is replaced with the same value, maintaining referential integrity whilst being anonymized.
154
+
155
+ You can see the [before](spec/cure/e2e/input/aws_billing_input.csv) and [after](spec/cure/e2e/output/aws_billing_output.csv) CSVs
156
+ made from this template by clicking on the links.
157
+
158
+ ```ruby
159
+ require "cure"
160
+
161
+ handler = Cure.init do
162
+ build do
163
+ candidate do
164
+ whitelist options: {
165
+ columns: %w[
166
+ bill/BillingEntity
167
+ bill/PayerAccountId
168
+ bill/BillingPeriodStartDate
169
+ bill/BillingPeriodEndDate
170
+ lineItem/UsageAccountId
171
+ lineItem/LineItemType
172
+ lineItem/UsageStartDate
173
+ lineItem/UsageEndDate
174
+ lineItem/UsageType
175
+ lineItem/ResourceId
176
+ lineItem/ProductCode
177
+ lineItem/UsageAmount
178
+ lineItem/CurrencyCode
179
+ ]
180
+ }
181
+ end
182
+ end
183
+
184
+ rot13_proc = proc { |source, _ctx|
185
+ source.gsub(/[^a-zA-Z0-9]/, '').tr('A-Za-z', 'N-ZA-Mn-za-m')
186
+ }
187
+
188
+ transform do
189
+ candidate column: "bill/PayerAccountId" do
190
+ with_translation { replace("full").with("placeholder", name: :account_number) }
191
+ end
192
+
193
+ candidate column: "lineItem/UsageAccountId" do
194
+ with_translation { replace("full").with("number", length: 10) }
195
+ end
196
+
197
+ candidate column: "lineItem/ResourceId", options: {ignore_empty: true} do
198
+ # If there is a match (i-[my-group]), replace just the match group with the output of rot13_proc
199
+ with_translation { replace("regex", regex_cg: "^i-(.*)").with("proc", execute: rot13_proc) }
200
+
201
+ # If the string contains the account number, replace with the account_number placeholder.
202
+ with_translation { replace("contain", match: "1234567890").with("placeholder", name: :account_number) }
203
+
204
+ # If no match is found, replace the whole value with the output of rot13_proc
205
+ if_no_match { replace("full").with("proc", execute: rot13_proc) }
206
+ end
207
+
208
+ # Hardcoded values that we may wish to reference
209
+ place_holders({account_number: 987_654_321})
210
+ end
211
+
212
+ export do
213
+ terminal title: "Preview", limit_rows: 20
214
+ csv file_name: "aws", directory: "/tmp/cure"
215
+ end
216
+ end
217
+
218
+ handler.process(:path, "path/to/my_sheet.csv")
219
+ ```
@@ -0,0 +1,52 @@
1
+ [... go back to build contents](main.md)
2
+
3
+ ## Add
4
+
5
+ ### What is it?
6
+
7
+ Add builder will add a new, empty column to the spreadsheet.
8
+
9
+ ### Why would you need it?
10
+
11
+ As useless as a new empty column sounds, it can be used for a placeholder column to be used later. A common example
12
+ of this may be if you want to add a variable to each row. For example, at the top of a spreadsheet, you may have a
13
+ date, but you want to add that to each row.
14
+
15
+ ### Full Configuration
16
+
17
+ ```ruby
18
+ build do
19
+ candidate(column: "new_column", named_range: "mysheet") { add options: { default_value: "-" } }
20
+ end
21
+ ```
22
+ - `column`: represents the column name, mandatory.
23
+ - `named_range`: specifies the named range holding the column, if no named range has been set you can leave it blank.
24
+ - `options`:
25
+ - `default_value`: not mandatory; if provided, it is used as the initial value of the new column in each row.
26
+
27
+ ### Example
28
+
29
+ ```ruby
30
+ build do
31
+ candidate(column: "col_b") { add }
32
+ end
33
+ ```
34
+
35
+ Original input:
36
+ ```
37
+ +-------+
38
+ | col_a |
39
+ +-------+
40
+ | a |
41
+ +-------+
42
+ ```
43
+
44
+ changes to:
45
+
46
+ ```
47
+ +-------+-------+
48
+ | col_a | col_b |
49
+ +-------+-------+
50
+ | a | |
51
+ +-------+-------+
52
+ ```