@danielarndt0/cnpj-db-loader 2.4.0-beta.1 → 2.4.0-beta.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -10,7 +10,7 @@ This version focuses on the real loading workflow:
10
10
  - check, download, retry, clean, and inspect the latest Federal Revenue CNPJ monthly ZIP archives from the public share
11
11
  - extract Receita Federal ZIP archives
12
12
  - validate an extracted tree
13
- - sanitize validated files before import to remove known low-level byte issues
13
+ - sanitize validated files into clean UTF-8 before import, removing NUL bytes, invalid bytes and problematic control characters
14
14
  - print or generate final, staging, or combined SQL schemas
15
15
  - configure and test the default PostgreSQL URL
16
16
  - import validated dataset files into PostgreSQL with:
@@ -51,8 +51,8 @@ cnpj-db-loader schema generate --profile full
51
51
  cnpj-db-loader import ./downloads/<reference>/sanitized --load-batch-size 500 --materialize-batch-size 50000 --verbose-progress
52
52
 
53
53
  # Optional hybrid path for PostgreSQL direct loading
54
- cnpj-db-loader postgres generate-script ./downloads/<reference>/sanitized --output ./downloads/<reference>/postgres-direct --force
55
- psql "postgres://postgres:postgres@localhost:5432/cnpj" -f ./downloads/<reference>/postgres-direct/import-postgres-direct.sql
54
+ cnpj-db-loader postgres generate-script ./downloads/<reference>/sanitized --output ./downloads/<reference>/postgres-direct --source-encoding UTF8 --transaction-mode phase --force
55
+ psql -d "postgres://postgres:postgres@localhost:5432/cnpj" -f ./downloads/<reference>/postgres-direct/import-postgres-direct.sql
56
56
  ```
57
57
 
58
58
  ## Stable commands
@@ -67,7 +67,7 @@ cnpj-db-loader federal-revenue sync [reference] [--reference <yyyy-mm>] [--curre
67
67
  cnpj-db-loader inspect <input>
68
68
  cnpj-db-loader extract <input> [--output <path>]
69
69
  cnpj-db-loader validate <input>
70
- cnpj-db-loader sanitize <input> [--output <path>] [--dataset <name>] [-f]
70
+ cnpj-db-loader sanitize <input> [--output <path>] [--dataset <name>] [--source-encoding <encoding>] [-f]
71
71
  cnpj-db-loader schema print [--profile <profile>]
72
72
  cnpj-db-loader schema generate [--name <name>] [--output <path>] [--profile <profile>]
73
73
  cnpj-db-loader database config set <url>
@@ -78,7 +78,7 @@ cnpj-db-loader database cleanup staging [--db-url <url>] [--dataset <name>] [--v
78
78
  cnpj-db-loader database cleanup materialized [--db-url <url>] [--dataset <name>] [--force]
79
79
  cnpj-db-loader database cleanup checkpoints [--db-url <url>] [--phase <phase>] [--dataset <name>] [--validated-path <path>] [--plan-id <id>] [--force]
80
80
  cnpj-db-loader database cleanup plans [--db-url <url>] [--validated-path <path>] [--plan-id <id>] [--force]
81
- cnpj-db-loader postgres generate-script <input> [--output <path>] [--dataset <name>] [--script-name <name>] [--source-encoding <encoding>] [-f]
81
+ cnpj-db-loader postgres generate-script <input> [--output <path>] [--dataset <name>] [--script-name <name>] [--source-encoding <encoding>] [--transaction-mode <mode>] [--include <items>] [--skip-indexes] [--skip-analyze] [-f]
82
82
  cnpj-db-loader postgres export-csv <input> [--output <path>] [--dataset <name>] [--script-name <name>] [-f]
83
83
  cnpj-db-loader import <input> [--db-url <url>] [--dataset <name>] [--load-batch-size <size>] [--materialize-batch-size <size>] [--verbose-progress] [-f]
84
84
  cnpj-db-loader import load <input> [--db-url <url>] [--dataset <name>] [--load-batch-size <size>] [--verbose-progress] [-f]
@@ -95,11 +95,11 @@ For local benchmarks or controlled full loads, the CLI can now generate a direct
95
95
 
96
96
  ```bash
97
97
  cnpj-db-loader sanitize ./downloads/<reference>/extracted
98
- cnpj-db-loader postgres generate-script ./downloads/<reference>/sanitized --output ./downloads/<reference>/postgres-direct --force
99
- psql "postgres://postgres:postgres@localhost:5432/cnpj" -f ./downloads/<reference>/postgres-direct/import-postgres-direct.sql
98
+ cnpj-db-loader postgres generate-script ./downloads/<reference>/sanitized --output ./downloads/<reference>/postgres-direct --source-encoding UTF8 --transaction-mode phase --force
99
+ psql -d "postgres://postgres:postgres@localhost:5432/cnpj" -f ./downloads/<reference>/postgres-direct/import-postgres-direct.sql
100
100
  ```
101
101
 
102
- This path keeps download, extraction, validation and sanitization inside the loader, then lets PostgreSQL load the sanitized Receita files directly through `\copy`, convert values into staging tables and materialize the final tables with set-based SQL. The standard `import` command remains the safest path when checkpoint resume and quarantine recovery are required.
102
+ This path keeps download, extraction, validation and robust UTF-8 sanitization inside the loader, then lets PostgreSQL load the sanitized Receita files directly through `\copy`, convert values into staging tables and materialize the final tables with set-based SQL. The standard `import` command remains the safest path when checkpoint resume and quarantine recovery are required.
103
103
 
104
104
  ## Logs
105
105