nm-tool-forge 0.2.3__tar.gz → 0.2.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. {nm_tool_forge-0.2.3 → nm_tool_forge-0.2.4}/PKG-INFO +53 -17
  2. {nm_tool_forge-0.2.3 → nm_tool_forge-0.2.4}/README.md +98 -62
  3. {nm_tool_forge-0.2.3 → nm_tool_forge-0.2.4}/pyproject.toml +1 -1
  4. {nm_tool_forge-0.2.3 → nm_tool_forge-0.2.4}/src/csvchunking/__init__.py +1 -1
  5. {nm_tool_forge-0.2.3 → nm_tool_forge-0.2.4}/src/csvchunking/chunker.py +6 -6
  6. {nm_tool_forge-0.2.3 → nm_tool_forge-0.2.4}/src/csvchunking/cli.py +9 -9
  7. {nm_tool_forge-0.2.3 → nm_tool_forge-0.2.4}/src/loganalysis/__init__.py +1 -1
  8. {nm_tool_forge-0.2.3 → nm_tool_forge-0.2.4}/src/nm_tool_forge.egg-info/PKG-INFO +53 -17
  9. {nm_tool_forge-0.2.3 → nm_tool_forge-0.2.4}/tests/test_csvchunking.py +5 -5
  10. {nm_tool_forge-0.2.3 → nm_tool_forge-0.2.4}/LICENSE +0 -0
  11. {nm_tool_forge-0.2.3 → nm_tool_forge-0.2.4}/setup.cfg +0 -0
  12. {nm_tool_forge-0.2.3 → nm_tool_forge-0.2.4}/src/csvchunking/__main__.py +0 -0
  13. {nm_tool_forge-0.2.3 → nm_tool_forge-0.2.4}/src/loganalysis/__main__.py +0 -0
  14. {nm_tool_forge-0.2.3 → nm_tool_forge-0.2.4}/src/loganalysis/analysis.py +0 -0
  15. {nm_tool_forge-0.2.3 → nm_tool_forge-0.2.4}/src/loganalysis/cli.py +0 -0
  16. {nm_tool_forge-0.2.3 → nm_tool_forge-0.2.4}/src/loganalysis/constants.py +0 -0
  17. {nm_tool_forge-0.2.3 → nm_tool_forge-0.2.4}/src/loganalysis/converters.py +0 -0
  18. {nm_tool_forge-0.2.3 → nm_tool_forge-0.2.4}/src/loganalysis/csv_export.py +0 -0
  19. {nm_tool_forge-0.2.3 → nm_tool_forge-0.2.4}/src/loganalysis/encoding.py +0 -0
  20. {nm_tool_forge-0.2.3 → nm_tool_forge-0.2.4}/src/loganalysis/filesystem.py +0 -0
  21. {nm_tool_forge-0.2.3 → nm_tool_forge-0.2.4}/src/loganalysis/models.py +0 -0
  22. {nm_tool_forge-0.2.3 → nm_tool_forge-0.2.4}/src/loganalysis/normalization.py +0 -0
  23. {nm_tool_forge-0.2.3 → nm_tool_forge-0.2.4}/src/loganalysis/parsing.py +0 -0
  24. {nm_tool_forge-0.2.3 → nm_tool_forge-0.2.4}/src/loganalysis/report_html.py +0 -0
  25. {nm_tool_forge-0.2.3 → nm_tool_forge-0.2.4}/src/loganalysis/report_markdown.py +0 -0
  26. {nm_tool_forge-0.2.3 → nm_tool_forge-0.2.4}/src/loganalysis/report_models.py +0 -0
  27. {nm_tool_forge-0.2.3 → nm_tool_forge-0.2.4}/src/loganalysis/report_pdf.py +0 -0
  28. {nm_tool_forge-0.2.3 → nm_tool_forge-0.2.4}/src/loganalysis/selftest.py +0 -0
  29. {nm_tool_forge-0.2.3 → nm_tool_forge-0.2.4}/src/nm_tool_forge.egg-info/SOURCES.txt +0 -0
  30. {nm_tool_forge-0.2.3 → nm_tool_forge-0.2.4}/src/nm_tool_forge.egg-info/dependency_links.txt +0 -0
  31. {nm_tool_forge-0.2.3 → nm_tool_forge-0.2.4}/src/nm_tool_forge.egg-info/entry_points.txt +0 -0
  32. {nm_tool_forge-0.2.3 → nm_tool_forge-0.2.4}/src/nm_tool_forge.egg-info/requires.txt +0 -0
  33. {nm_tool_forge-0.2.3 → nm_tool_forge-0.2.4}/src/nm_tool_forge.egg-info/top_level.txt +0 -0
  34. {nm_tool_forge-0.2.3 → nm_tool_forge-0.2.4}/tests/test_analysis.py +0 -0
  35. {nm_tool_forge-0.2.3 → nm_tool_forge-0.2.4}/tests/test_normalization.py +0 -0
  36. {nm_tool_forge-0.2.3 → nm_tool_forge-0.2.4}/tests/test_parsing.py +0 -0
  37. {nm_tool_forge-0.2.3 → nm_tool_forge-0.2.4}/tests/test_report_html.py +0 -0
  38. {nm_tool_forge-0.2.3 → nm_tool_forge-0.2.4}/tests/test_report_markdown.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: nm-tool-forge
3
- Version: 0.2.3
3
+ Version: 0.2.4
4
4
  Summary: Analyze MigMan log files and generate aggregated CSV, Markdown, HTML, and optional PDF reports.
5
5
  Author-email: Stefan Ewald <s.ew@outlook.de>
6
6
  License-Expression: MIT
@@ -31,7 +31,7 @@ Dynamic: license-file
31
31
 
32
32
  # nm-tool-forge
33
33
 
34
- `nm-tool-forge` analyzes MigMan text log files with severity tokens such as `INFO`, `ERROR`, and `WARNING` and generates aggregated CSV, Markdown, HTML, and optional PDF reports.
34
+ `nm-tool-forge` analyzes MigMan text log files with severity tokens such as `INFO`, `ERROR`, and `WARNING` and generates aggregated CSV, Markdown, HTML, and optional PDF reports. The package also includes `csvchunking`, a small helper for splitting large CSV files into migration-friendly chunks.
35
35
 
36
36
  The project uses a package-ready `src` layout. The legacy `log_analysis.py` file remains available as a thin compatibility entry point for older local setups.
37
37
 
@@ -43,6 +43,7 @@ The project uses a package-ready `src` layout. The legacy `log_analysis.py` file
43
43
  - Generate Markdown summary reports
44
44
  - Optionally convert reports to HTML and PDF
45
45
  - Keep a backup copy of analyzed log files
46
+ - Split large CSV files into numbered chunks while preserving the header row
46
47
  - Run built-in self-tests from the CLI
47
48
 
48
49
  ## Installation
@@ -61,12 +62,14 @@ python -m pip install .[pdf,dev]
61
62
 
62
63
  ## Command-line usage
63
64
 
64
- After installation, both entry points are available:
65
+ After installation, the CLI entry points are available:
65
66
 
66
67
  ```powershell
67
68
  python -m loganalysis --help
69
+ python -m csvchunking --help
68
70
  loganalysis --help
69
71
  nm-tool-forge --help
72
+ csvchunking --help
70
73
  ```
71
74
 
72
75
  Typical analysis run:
@@ -89,29 +92,30 @@ python -m loganalysis --self-test
89
92
 
90
93
  Legacy compatibility call:
91
94
 
95
+ ```powershell
96
+ python .\log_analysis.py --convert
97
+ ```
92
98
 
93
- ## Release process
94
-
95
- To publish a new release, always test on TestPyPI first, then upload to PyPI only after successful Conda/Smoke-Tests:
96
-
97
- ```bash
98
- export TWINE_USERNAME="__token__"
99
- export TWINE_PASSWORD="pypi-..."
99
+ CSV chunking run:
100
100
 
101
- bash scripts/release_testpypi.sh --bump patch
102
- bash scripts/release_pypi.sh --yes
101
+ ```powershell
102
+ csvchunking "data\large_export.csv" --chunk-size 5000
103
103
  ```
104
104
 
105
- **Hinweise:**
106
- - Erst TestPyPI ausführen und testen, dann final nach PyPI hochladen.
107
- - Versionen auf PyPI können nicht überschrieben oder erneut verwendet werden.
105
+ The command creates an output directory next to the input file named after the CSV stem. For example, `data\large_export.csv` is split into files such as `data\large_export\large_export_01.csv`, `data\large_export\large_export_02.csv`, and so on.
106
+
107
+ CSV chunking with an explicit encoding:
108
108
 
109
109
  ```powershell
110
- python .\log_analysis.py --convert
110
+ python -m csvchunking "data\large_export.csv" --chunk-size 5000 --encoding utf-8-sig
111
111
  ```
112
112
 
113
+ Each chunk contains the original header row plus up to `--chunk-size` data rows. The delimiter is detected automatically; if detection fails, semicolon-separated CSV is used.
114
+
113
115
  ## Supported CLI options
114
116
 
117
+ Log analysis options:
118
+
115
119
  - `--logs-dir`
116
120
  - `--out-dir`
117
121
  - `--backup-dir`
@@ -119,6 +123,28 @@ python .\log_analysis.py --convert
119
123
  - `--convert`
120
124
  - `--self-test`
121
125
 
126
+ CSV chunking options:
127
+
128
+ - `input_file` - path to the CSV file to split
129
+ - `--chunk-size` - required number of data rows per output file; must be greater than zero
130
+ - `--encoding` - input and output encoding; defaults to `utf-8-sig`
131
+
132
+ ## Release process
133
+
134
+ To publish a new release, always test on TestPyPI first, then upload to PyPI only after successful Conda smoke tests:
135
+
136
+ ```bash
137
+ export TWINE_USERNAME="__token__"
138
+ export TWINE_PASSWORD="pypi-..."
139
+
140
+ bash scripts/release_testpypi.sh --bump patch
141
+ bash scripts/release_pypi.sh --yes
142
+ ```
143
+
144
+ **Notes:**
145
+ - Run and verify the TestPyPI release first, then upload the final package to PyPI.
146
+ - PyPI versions cannot be overwritten or reused.
147
+
122
148
  ## Library usage
123
149
 
124
150
  ```python
@@ -130,6 +156,7 @@ from loganalysis import (
130
156
  iter_logical_entries,
131
157
  normalize_message,
132
158
  )
159
+ from csvchunking import split_csv
133
160
 
134
161
  result = analyze_file(Path("logs/app.txt"))
135
162
  print(result["norm_counts"])
@@ -146,14 +173,21 @@ convert_report_md_to_html_pdf(
146
173
  Path("log_analyse_out/report.html"),
147
174
  Path("log_analyse_out/report.pdf"),
148
175
  )
176
+
177
+ chunk_result = split_csv(Path("data/large_export.csv"), chunk_size=5000)
178
+ print(chunk_result.output_dir)
179
+ print(chunk_result.output_files)
149
180
  ```
150
181
 
182
+ `split_csv()` returns a `ChunkResult` with the input file, output directory, chunk size, processed data-row count, created file count, and generated output file paths.
183
+
151
184
  ## Project structure
152
185
 
153
186
  ```text
154
187
  .
155
188
  ├─ pyproject.toml
156
189
  ├─ src/loganalysis/
190
+ ├─ src/csvchunking/
157
191
  ├─ tests/
158
192
  ├─ docs/
159
193
  └─ log_analysis.py
@@ -168,7 +202,9 @@ Important modules:
168
202
  - `report_html.py` - HTML/CSS rendering
169
203
  - `report_pdf.py` - PDF engine selection and fallback handling
170
204
  - `converters.py` - Markdown-to-HTML/PDF conversion
171
- - `cli.py` - command-line entry point
205
+ - `loganalysis/cli.py` - log analysis command-line entry point
206
+ - `csvchunking/chunker.py` - CSV splitting logic and `ChunkResult`
207
+ - `csvchunking/cli.py` - CSV chunking command-line entry point
172
208
 
173
209
  ## HTML/PDF conversion
174
210
 
@@ -1,6 +1,6 @@
1
1
  # nm-tool-forge
2
2
 
3
- `nm-tool-forge` analyzes MigMan text log files with severity tokens such as `INFO`, `ERROR`, and `WARNING` and generates aggregated CSV, Markdown, HTML, and optional PDF reports.
3
+ `nm-tool-forge` analyzes MigMan text log files with severity tokens such as `INFO`, `ERROR`, and `WARNING` and generates aggregated CSV, Markdown, HTML, and optional PDF reports. The package also includes `csvchunking`, a small helper for splitting large CSV files into migration-friendly chunks.
4
4
 
5
5
  The project uses a package-ready `src` layout. The legacy `log_analysis.py` file remains available as a thin compatibility entry point for older local setups.
6
6
 
@@ -9,10 +9,11 @@ The project uses a package-ready `src` layout. The legacy `log_analysis.py` file
9
9
  - Parse logical log entries from multi-line text logs
10
10
  - Normalize recurring error patterns for better aggregation
11
11
  - Generate aggregated CSV reports
12
- - Generate Markdown summary reports
13
- - Optionally convert reports to HTML and PDF
14
- - Keep a backup copy of analyzed log files
15
- - Run built-in self-tests from the CLI
12
+ - Generate Markdown summary reports
13
+ - Optionally convert reports to HTML and PDF
14
+ - Keep a backup copy of analyzed log files
15
+ - Split large CSV files into numbered chunks while preserving the header row
16
+ - Run built-in self-tests from the CLI
16
17
 
17
18
  ## Installation
18
19
 
@@ -30,13 +31,15 @@ python -m pip install .[pdf,dev]
30
31
 
31
32
  ## Command-line usage
32
33
 
33
- After installation, both entry points are available:
34
-
35
- ```powershell
36
- python -m loganalysis --help
37
- loganalysis --help
38
- nm-tool-forge --help
39
- ```
34
+ After installation, the CLI entry points are available:
35
+
36
+ ```powershell
37
+ python -m loganalysis --help
38
+ python -m csvchunking --help
39
+ loganalysis --help
40
+ nm-tool-forge --help
41
+ csvchunking --help
42
+ ```
40
43
 
41
44
  Typical analysis run:
42
45
 
@@ -50,18 +53,54 @@ Analysis with HTML/PDF conversion:
50
53
  nm-tool-forge --logs-dir logs --out-dir log_analyse_out --convert
51
54
  ```
52
55
 
53
- Self-test mode:
54
-
55
- ```powershell
56
- python -m loganalysis --self-test
57
- ```
58
-
59
- Legacy compatibility call:
60
-
61
-
62
- ## Release process
63
-
64
- To publish a new release, always test on TestPyPI first, then upload to PyPI only after successful Conda/Smoke-Tests:
56
+ Self-test mode:
57
+
58
+ ```powershell
59
+ python -m loganalysis --self-test
60
+ ```
61
+
62
+ Legacy compatibility call:
63
+
64
+ ```powershell
65
+ python .\log_analysis.py --convert
66
+ ```
67
+
68
+ CSV chunking run:
69
+
70
+ ```powershell
71
+ csvchunking "data\large_export.csv" --chunk-size 5000
72
+ ```
73
+
74
+ The command creates an output directory next to the input file named after the CSV stem. For example, `data\large_export.csv` is split into files such as `data\large_export\large_export_01.csv`, `data\large_export\large_export_02.csv`, and so on.
75
+
76
+ CSV chunking with an explicit encoding:
77
+
78
+ ```powershell
79
+ python -m csvchunking "data\large_export.csv" --chunk-size 5000 --encoding utf-8-sig
80
+ ```
81
+
82
+ Each chunk contains the original header row plus up to `--chunk-size` data rows. The delimiter is detected automatically; if detection fails, semicolon-separated CSV is used.
83
+
84
+ ## Supported CLI options
85
+
86
+ Log analysis options:
87
+
88
+ - `--logs-dir`
89
+ - `--out-dir`
90
+ - `--backup-dir`
91
+ - `--top-examples`
92
+ - `--convert`
93
+ - `--self-test`
94
+
95
+ CSV chunking options:
96
+
97
+ - `input_file` - path to the CSV file to split
98
+ - `--chunk-size` - required number of data rows per output file; must be greater than zero
99
+ - `--encoding` - input and output encoding; defaults to `utf-8-sig`
100
+
101
+ ## Release process
102
+
103
+ To publish a new release, always test on TestPyPI first, then upload to PyPI only after successful Conda smoke tests:
65
104
 
66
105
  ```bash
67
106
  export TWINE_USERNAME="__token__"
@@ -71,37 +110,25 @@ bash scripts/release_testpypi.sh --bump patch
71
110
  bash scripts/release_pypi.sh --yes
72
111
  ```
73
112
 
74
- **Hinweise:**
75
- - Erst TestPyPI ausführen und testen, dann final nach PyPI hochladen.
76
- - Versionen auf PyPI können nicht überschrieben oder erneut verwendet werden.
77
-
78
- ```powershell
79
- python .\log_analysis.py --convert
80
- ```
81
-
82
- ## Supported CLI options
83
-
84
- - `--logs-dir`
85
- - `--out-dir`
86
- - `--backup-dir`
87
- - `--top-examples`
88
- - `--convert`
89
- - `--self-test`
90
-
91
- ## Library usage
92
-
93
- ```python
113
+ **Notes:**
114
+ - Run and verify the TestPyPI release first, then upload the final package to PyPI.
115
+ - PyPI versions cannot be overwritten or reused.
116
+
117
+ ## Library usage
118
+
119
+ ```python
94
120
  from pathlib import Path
95
121
 
96
122
  from loganalysis import (
97
123
  analyze_file,
98
124
  convert_report_md_to_html_pdf,
99
- iter_logical_entries,
100
- normalize_message,
101
- )
102
-
103
- result = analyze_file(Path("logs/app.txt"))
104
- print(result["norm_counts"])
125
+ iter_logical_entries,
126
+ normalize_message,
127
+ )
128
+ from csvchunking import split_csv
129
+
130
+ result = analyze_file(Path("logs/app.txt"))
131
+ print(result["norm_counts"])
105
132
 
106
133
  print(normalize_message(
107
134
  'Conversion: X =3100110. 138 The record was not found in table "Teile".'
@@ -112,20 +139,27 @@ for entry in iter_logical_entries(Path("logs/app.txt")):
112
139
 
113
140
  convert_report_md_to_html_pdf(
114
141
  Path("log_analyse_out/report.md"),
115
- Path("log_analyse_out/report.html"),
116
- Path("log_analyse_out/report.pdf"),
117
- )
118
- ```
142
+ Path("log_analyse_out/report.html"),
143
+ Path("log_analyse_out/report.pdf"),
144
+ )
145
+
146
+ chunk_result = split_csv(Path("data/large_export.csv"), chunk_size=5000)
147
+ print(chunk_result.output_dir)
148
+ print(chunk_result.output_files)
149
+ ```
150
+
151
+ `split_csv()` returns a `ChunkResult` with the input file, output directory, chunk size, processed data-row count, created file count, and generated output file paths.
119
152
 
120
153
  ## Project structure
121
154
 
122
155
  ```text
123
156
  .
124
- ├─ pyproject.toml
125
- ├─ src/loganalysis/
126
- ├─ tests/
127
- ├─ docs/
128
- └─ log_analysis.py
157
+ ├─ pyproject.toml
158
+ ├─ src/loganalysis/
159
+ ├─ src/csvchunking/
160
+ ├─ tests/
161
+ ├─ docs/
162
+ └─ log_analysis.py
129
163
  ```
130
164
 
131
165
  Important modules:
@@ -135,9 +169,11 @@ Important modules:
135
169
  - `normalization.py` - message normalization
136
170
  - `report_markdown.py` - Markdown report model and rendering
137
171
  - `report_html.py` - HTML/CSS rendering
138
- - `report_pdf.py` - PDF engine selection and fallback handling
139
- - `converters.py` - Markdown-to-HTML/PDF conversion
140
- - `cli.py` - command-line entry point
172
+ - `report_pdf.py` - PDF engine selection and fallback handling
173
+ - `converters.py` - Markdown-to-HTML/PDF conversion
174
+ - `loganalysis/cli.py` - log analysis command-line entry point
175
+ - `csvchunking/chunker.py` - CSV splitting logic and `ChunkResult`
176
+ - `csvchunking/cli.py` - CSV chunking command-line entry point
141
177
 
142
178
  ## HTML/PDF conversion
143
179
 
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "nm-tool-forge"
7
- version = "0.2.3"
7
+ version = "0.2.4"
8
8
  description = "Analyze MigMan log files and generate aggregated CSV, Markdown, HTML, and optional PDF reports."
9
9
  readme = { file = "README.md", content-type = "text/markdown" }
10
10
  requires-python = ">=3.10"
@@ -1,4 +1,4 @@
1
1
  from .chunker import ChunkResult, split_csv
2
2
 
3
3
  __all__ = ["ChunkResult", "split_csv"]
4
- __version__ = "0.2.3"
4
+ __version__ = "0.2.4"
@@ -18,16 +18,16 @@ def split_csv(
18
18
  chunk_size: int,
19
19
  encoding: str = "utf-8-sig",
20
20
  ) -> ChunkResult:
21
- if not Path(input_file).is_file():
22
- raise FileNotFoundError(f"Eingabedatei nicht gefunden: {input_file}")
23
- if chunk_size <= 0:
24
- raise ValueError("chunk_size muss > 0 sein")
21
+ if not Path(input_file).is_file():
22
+ raise FileNotFoundError(f"Input file not found: {input_file}")
23
+ if chunk_size <= 0:
24
+ raise ValueError("chunk_size must be greater than 0")
25
25
 
26
26
  input_file = Path(input_file)
27
27
  output_dir = input_file.parent / input_file.stem
28
28
  output_dir.mkdir(exist_ok=True)
29
29
 
30
- # Delimiter automatisch erkennen
30
+ # Detect the delimiter automatically.
31
31
  with open(input_file, encoding=encoding, newline="") as f:
32
32
  sample = f.read(4096)
33
33
  f.seek(0)
@@ -41,7 +41,7 @@ def split_csv(
41
41
  try:
42
42
  header = next(reader)
43
43
  except StopIteration as exc:
44
- raise ValueError("Eingabedatei ist leer.") from exc
44
+ raise ValueError("Input file is empty.") from exc
45
45
  chunk = []
46
46
  file_count = 0
47
47
  data_rows = 0
@@ -7,22 +7,22 @@ from .chunker import split_csv
7
7
 
8
8
  def main() -> None:
9
9
  parser = argparse.ArgumentParser(
10
- description="Teilt eine große CSV-Datei in kleinere Chunks mit Header.",
10
+ description="Split a large CSV file into smaller chunks with a header row.",
11
11
  )
12
- parser.add_argument("input_file", help="Pfad zur CSV-Datei")
12
+ parser.add_argument("input_file", help="Path to the CSV file")
13
13
  parser.add_argument(
14
14
  "--chunk-size",
15
15
  type=int,
16
16
  required=True,
17
- help="Anzahl Datenzeilen pro Ausgabedatei, muss > 0 sein",
17
+ help="Number of data rows per output file; must be greater than 0",
18
18
  )
19
- parser.add_argument("--encoding", default="utf-8-sig", help="Encoding für Ein- und Ausgabe (Standard: utf-8-sig)")
19
+ parser.add_argument("--encoding", default="utf-8-sig", help="Input and output encoding (Default: utf-8-sig)")
20
20
  args = parser.parse_args()
21
- try:
22
- result = split_csv(Path(args.input_file), args.chunk_size, encoding=args.encoding)
23
- except Exception as e:
24
- print(f"Fehler: {e}", file=sys.stderr)
25
- sys.exit(1)
21
+ try:
22
+ result = split_csv(Path(args.input_file), args.chunk_size, encoding=args.encoding)
23
+ except Exception as e:
24
+ print(f"Error: {e}", file=sys.stderr)
25
+ sys.exit(1)
26
26
  print("CSV chunking completed.")
27
27
  print(f"- Input: {result.input_file}")
28
28
  print(f"- Output directory: {result.output_dir}")
@@ -13,4 +13,4 @@ __all__ = [
13
13
  "run_analysis",
14
14
  ]
15
15
 
16
- __version__ = "0.2.3"
16
+ __version__ = "0.2.4"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: nm-tool-forge
3
- Version: 0.2.3
3
+ Version: 0.2.4
4
4
  Summary: Analyze MigMan log files and generate aggregated CSV, Markdown, HTML, and optional PDF reports.
5
5
  Author-email: Stefan Ewald <s.ew@outlook.de>
6
6
  License-Expression: MIT
@@ -31,7 +31,7 @@ Dynamic: license-file
31
31
 
32
32
  # nm-tool-forge
33
33
 
34
- `nm-tool-forge` analyzes MigMan text log files with severity tokens such as `INFO`, `ERROR`, and `WARNING` and generates aggregated CSV, Markdown, HTML, and optional PDF reports.
34
+ `nm-tool-forge` analyzes MigMan text log files with severity tokens such as `INFO`, `ERROR`, and `WARNING` and generates aggregated CSV, Markdown, HTML, and optional PDF reports. The package also includes `csvchunking`, a small helper for splitting large CSV files into migration-friendly chunks.
35
35
 
36
36
  The project uses a package-ready `src` layout. The legacy `log_analysis.py` file remains available as a thin compatibility entry point for older local setups.
37
37
 
@@ -43,6 +43,7 @@ The project uses a package-ready `src` layout. The legacy `log_analysis.py` file
43
43
  - Generate Markdown summary reports
44
44
  - Optionally convert reports to HTML and PDF
45
45
  - Keep a backup copy of analyzed log files
46
+ - Split large CSV files into numbered chunks while preserving the header row
46
47
  - Run built-in self-tests from the CLI
47
48
 
48
49
  ## Installation
@@ -61,12 +62,14 @@ python -m pip install .[pdf,dev]
61
62
 
62
63
  ## Command-line usage
63
64
 
64
- After installation, both entry points are available:
65
+ After installation, the CLI entry points are available:
65
66
 
66
67
  ```powershell
67
68
  python -m loganalysis --help
69
+ python -m csvchunking --help
68
70
  loganalysis --help
69
71
  nm-tool-forge --help
72
+ csvchunking --help
70
73
  ```
71
74
 
72
75
  Typical analysis run:
@@ -89,29 +92,30 @@ python -m loganalysis --self-test
89
92
 
90
93
  Legacy compatibility call:
91
94
 
95
+ ```powershell
96
+ python .\log_analysis.py --convert
97
+ ```
92
98
 
93
- ## Release process
94
-
95
- To publish a new release, always test on TestPyPI first, then upload to PyPI only after successful Conda/Smoke-Tests:
96
-
97
- ```bash
98
- export TWINE_USERNAME="__token__"
99
- export TWINE_PASSWORD="pypi-..."
99
+ CSV chunking run:
100
100
 
101
- bash scripts/release_testpypi.sh --bump patch
102
- bash scripts/release_pypi.sh --yes
101
+ ```powershell
102
+ csvchunking "data\large_export.csv" --chunk-size 5000
103
103
  ```
104
104
 
105
- **Hinweise:**
106
- - Erst TestPyPI ausführen und testen, dann final nach PyPI hochladen.
107
- - Versionen auf PyPI können nicht überschrieben oder erneut verwendet werden.
105
+ The command creates an output directory next to the input file named after the CSV stem. For example, `data\large_export.csv` is split into files such as `data\large_export\large_export_01.csv`, `data\large_export\large_export_02.csv`, and so on.
106
+
107
+ CSV chunking with an explicit encoding:
108
108
 
109
109
  ```powershell
110
- python .\log_analysis.py --convert
110
+ python -m csvchunking "data\large_export.csv" --chunk-size 5000 --encoding utf-8-sig
111
111
  ```
112
112
 
113
+ Each chunk contains the original header row plus up to `--chunk-size` data rows. The delimiter is detected automatically; if detection fails, semicolon-separated CSV is used.
114
+
113
115
  ## Supported CLI options
114
116
 
117
+ Log analysis options:
118
+
115
119
  - `--logs-dir`
116
120
  - `--out-dir`
117
121
  - `--backup-dir`
@@ -119,6 +123,28 @@ python .\log_analysis.py --convert
119
123
  - `--convert`
120
124
  - `--self-test`
121
125
 
126
+ CSV chunking options:
127
+
128
+ - `input_file` - path to the CSV file to split
129
+ - `--chunk-size` - required number of data rows per output file; must be greater than zero
130
+ - `--encoding` - input and output encoding; defaults to `utf-8-sig`
131
+
132
+ ## Release process
133
+
134
+ To publish a new release, always test on TestPyPI first, then upload to PyPI only after successful Conda smoke tests:
135
+
136
+ ```bash
137
+ export TWINE_USERNAME="__token__"
138
+ export TWINE_PASSWORD="pypi-..."
139
+
140
+ bash scripts/release_testpypi.sh --bump patch
141
+ bash scripts/release_pypi.sh --yes
142
+ ```
143
+
144
+ **Notes:**
145
+ - Run and verify the TestPyPI release first, then upload the final package to PyPI.
146
+ - PyPI versions cannot be overwritten or reused.
147
+
122
148
  ## Library usage
123
149
 
124
150
  ```python
@@ -130,6 +156,7 @@ from loganalysis import (
130
156
  iter_logical_entries,
131
157
  normalize_message,
132
158
  )
159
+ from csvchunking import split_csv
133
160
 
134
161
  result = analyze_file(Path("logs/app.txt"))
135
162
  print(result["norm_counts"])
@@ -146,14 +173,21 @@ convert_report_md_to_html_pdf(
146
173
  Path("log_analyse_out/report.html"),
147
174
  Path("log_analyse_out/report.pdf"),
148
175
  )
176
+
177
+ chunk_result = split_csv(Path("data/large_export.csv"), chunk_size=5000)
178
+ print(chunk_result.output_dir)
179
+ print(chunk_result.output_files)
149
180
  ```
150
181
 
182
+ `split_csv()` returns a `ChunkResult` with the input file, output directory, chunk size, processed data-row count, created file count, and generated output file paths.
183
+
151
184
  ## Project structure
152
185
 
153
186
  ```text
154
187
  .
155
188
  ├─ pyproject.toml
156
189
  ├─ src/loganalysis/
190
+ ├─ src/csvchunking/
157
191
  ├─ tests/
158
192
  ├─ docs/
159
193
  └─ log_analysis.py
@@ -168,7 +202,9 @@ Important modules:
168
202
  - `report_html.py` - HTML/CSS rendering
169
203
  - `report_pdf.py` - PDF engine selection and fallback handling
170
204
  - `converters.py` - Markdown-to-HTML/PDF conversion
171
- - `cli.py` - command-line entry point
205
+ - `loganalysis/cli.py` - log analysis command-line entry point
206
+ - `csvchunking/chunker.py` - CSV splitting logic and `ChunkResult`
207
+ - `csvchunking/cli.py` - CSV chunking command-line entry point
172
208
 
173
209
  ## HTML/PDF conversion
174
210
 
@@ -12,7 +12,7 @@ def make_csv(tmp_path, name, header, rows, encoding="utf-8-sig", delimiter=";"):
12
12
  return file
13
13
 
14
14
 
15
- def test_normale_aufteilung(tmp_path):
15
+ def test_regular_split(tmp_path):
16
16
  header = ["col1", "col2"]
17
17
  rows = [["A", "1"], ["B", "2"], ["C", "3"], ["D", "4"], ["E", "5"]]
18
18
  file = make_csv(tmp_path, "sample.csv", header, rows)
@@ -27,7 +27,7 @@ def test_normale_aufteilung(tmp_path):
27
27
  assert (result.output_dir / "sample_03.csv").exists()
28
28
 
29
29
 
30
- def test_header_in_jeder_datei(tmp_path):
30
+ def test_header_in_each_file(tmp_path):
31
31
  header = ["foo", "bar"]
32
32
  rows = [["x", "1"], ["y", "2"], ["z", "3"]]
33
33
  file = make_csv(tmp_path, "test.csv", header, rows)
@@ -37,7 +37,7 @@ def test_header_in_jeder_datei(tmp_path):
37
37
  assert f.readline().strip() == "foo;bar"
38
38
 
39
39
 
40
- def test_dateiname_mit_leerzeichen(tmp_path):
40
+ def test_filename_with_spaces(tmp_path):
41
41
  header = ["a", "b"]
42
42
  rows = [["1", "2"]]
43
43
  file = make_csv(tmp_path, "Part-Storage Areas Relationships.csv", header, rows)
@@ -46,7 +46,7 @@ def test_dateiname_mit_leerzeichen(tmp_path):
46
46
  assert (result.output_dir / "Part-Storage Areas Relationships_01.csv").exists()
47
47
 
48
48
 
49
- def test_ungueltige_chunkgroesse(tmp_path):
49
+ def test_invalid_chunk_size(tmp_path):
50
50
  header = ["a", "b"]
51
51
  rows = [["1", "2"]]
52
52
  file = make_csv(tmp_path, "fail.csv", header, rows)
@@ -56,7 +56,7 @@ def test_ungueltige_chunkgroesse(tmp_path):
56
56
  split_csv(file, chunk_size=-1)
57
57
 
58
58
 
59
- def test_leere_datei(tmp_path):
59
+ def test_empty_file(tmp_path):
60
60
  file = tmp_path / "empty.csv"
61
61
  file.write_text("")
62
62
  with pytest.raises(ValueError):
File without changes
File without changes