nm-tool-forge 0.1.0__tar.gz → 0.2.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {nm_tool_forge-0.1.0 → nm_tool_forge-0.2.4}/PKG-INFO +57 -4
- {nm_tool_forge-0.1.0 → nm_tool_forge-0.2.4}/README.md +105 -52
- {nm_tool_forge-0.1.0 → nm_tool_forge-0.2.4}/pyproject.toml +67 -66
- nm_tool_forge-0.2.4/src/csvchunking/__init__.py +4 -0
- nm_tool_forge-0.2.4/src/csvchunking/__main__.py +4 -0
- nm_tool_forge-0.2.4/src/csvchunking/chunker.py +76 -0
- nm_tool_forge-0.2.4/src/csvchunking/cli.py +31 -0
- {nm_tool_forge-0.1.0 → nm_tool_forge-0.2.4}/src/loganalysis/__init__.py +16 -16
- {nm_tool_forge-0.1.0 → nm_tool_forge-0.2.4}/src/loganalysis/cli.py +8 -4
- {nm_tool_forge-0.1.0 → nm_tool_forge-0.2.4}/src/loganalysis/csv_export.py +7 -7
- {nm_tool_forge-0.1.0 → nm_tool_forge-0.2.4}/src/loganalysis/parsing.py +6 -6
- {nm_tool_forge-0.1.0 → nm_tool_forge-0.2.4}/src/loganalysis/report_markdown.py +16 -12
- {nm_tool_forge-0.1.0 → nm_tool_forge-0.2.4}/src/loganalysis/report_pdf.py +9 -4
- {nm_tool_forge-0.1.0 → nm_tool_forge-0.2.4}/src/loganalysis/selftest.py +17 -11
- {nm_tool_forge-0.1.0 → nm_tool_forge-0.2.4}/src/nm_tool_forge.egg-info/PKG-INFO +57 -4
- {nm_tool_forge-0.1.0 → nm_tool_forge-0.2.4}/src/nm_tool_forge.egg-info/SOURCES.txt +5 -0
- {nm_tool_forge-0.1.0 → nm_tool_forge-0.2.4}/src/nm_tool_forge.egg-info/entry_points.txt +1 -0
- {nm_tool_forge-0.1.0 → nm_tool_forge-0.2.4}/src/nm_tool_forge.egg-info/top_level.txt +1 -0
- {nm_tool_forge-0.1.0 → nm_tool_forge-0.2.4}/tests/test_analysis.py +6 -5
- nm_tool_forge-0.2.4/tests/test_csvchunking.py +63 -0
- {nm_tool_forge-0.1.0 → nm_tool_forge-0.2.4}/tests/test_report_markdown.py +3 -1
- {nm_tool_forge-0.1.0 → nm_tool_forge-0.2.4}/LICENSE +0 -0
- {nm_tool_forge-0.1.0 → nm_tool_forge-0.2.4}/setup.cfg +0 -0
- {nm_tool_forge-0.1.0 → nm_tool_forge-0.2.4}/src/loganalysis/__main__.py +0 -0
- {nm_tool_forge-0.1.0 → nm_tool_forge-0.2.4}/src/loganalysis/analysis.py +0 -0
- {nm_tool_forge-0.1.0 → nm_tool_forge-0.2.4}/src/loganalysis/constants.py +0 -0
- {nm_tool_forge-0.1.0 → nm_tool_forge-0.2.4}/src/loganalysis/converters.py +0 -0
- {nm_tool_forge-0.1.0 → nm_tool_forge-0.2.4}/src/loganalysis/encoding.py +0 -0
- {nm_tool_forge-0.1.0 → nm_tool_forge-0.2.4}/src/loganalysis/filesystem.py +0 -0
- {nm_tool_forge-0.1.0 → nm_tool_forge-0.2.4}/src/loganalysis/models.py +0 -0
- {nm_tool_forge-0.1.0 → nm_tool_forge-0.2.4}/src/loganalysis/normalization.py +0 -0
- {nm_tool_forge-0.1.0 → nm_tool_forge-0.2.4}/src/loganalysis/report_html.py +0 -0
- {nm_tool_forge-0.1.0 → nm_tool_forge-0.2.4}/src/loganalysis/report_models.py +0 -0
- {nm_tool_forge-0.1.0 → nm_tool_forge-0.2.4}/src/nm_tool_forge.egg-info/dependency_links.txt +0 -0
- {nm_tool_forge-0.1.0 → nm_tool_forge-0.2.4}/src/nm_tool_forge.egg-info/requires.txt +0 -0
- {nm_tool_forge-0.1.0 → nm_tool_forge-0.2.4}/tests/test_normalization.py +0 -0
- {nm_tool_forge-0.1.0 → nm_tool_forge-0.2.4}/tests/test_parsing.py +0 -0
- {nm_tool_forge-0.1.0 → nm_tool_forge-0.2.4}/tests/test_report_html.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: nm-tool-forge
|
|
3
|
-
Version: 0.1.0
|
|
3
|
+
Version: 0.2.4
|
|
4
4
|
Summary: Analyze MigMan log files and generate aggregated CSV, Markdown, HTML, and optional PDF reports.
|
|
5
5
|
Author-email: Stefan Ewald <s.ew@outlook.de>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -31,7 +31,7 @@ Dynamic: license-file
|
|
|
31
31
|
|
|
32
32
|
# nm-tool-forge
|
|
33
33
|
|
|
34
|
-
`nm-tool-forge` analyzes MigMan text log files with severity tokens such as `INFO`, `ERROR`, and `WARNING` and generates aggregated CSV, Markdown, HTML, and optional PDF reports.
|
|
34
|
+
`nm-tool-forge` analyzes MigMan text log files with severity tokens such as `INFO`, `ERROR`, and `WARNING` and generates aggregated CSV, Markdown, HTML, and optional PDF reports. The package also includes `csvchunking`, a small helper for splitting large CSV files into migration-friendly chunks.
|
|
35
35
|
|
|
36
36
|
The project uses a package-ready `src` layout. The legacy `log_analysis.py` file remains available as a thin compatibility entry point for older local setups.
|
|
37
37
|
|
|
@@ -43,6 +43,7 @@ The project uses a package-ready `src` layout. The legacy `log_analysis.py` file
|
|
|
43
43
|
- Generate Markdown summary reports
|
|
44
44
|
- Optionally convert reports to HTML and PDF
|
|
45
45
|
- Keep a backup copy of analyzed log files
|
|
46
|
+
- Split large CSV files into numbered chunks while preserving the header row
|
|
46
47
|
- Run built-in self-tests from the CLI
|
|
47
48
|
|
|
48
49
|
## Installation
|
|
@@ -61,12 +62,14 @@ python -m pip install .[pdf,dev]
|
|
|
61
62
|
|
|
62
63
|
## Command-line usage
|
|
63
64
|
|
|
64
|
-
After installation,
|
|
65
|
+
After installation, the CLI entry points are available:
|
|
65
66
|
|
|
66
67
|
```powershell
|
|
67
68
|
python -m loganalysis --help
|
|
69
|
+
python -m csvchunking --help
|
|
68
70
|
loganalysis --help
|
|
69
71
|
nm-tool-forge --help
|
|
72
|
+
csvchunking --help
|
|
70
73
|
```
|
|
71
74
|
|
|
72
75
|
Typical analysis run:
|
|
@@ -93,8 +96,26 @@ Legacy compatibility call:
|
|
|
93
96
|
python .\log_analysis.py --convert
|
|
94
97
|
```
|
|
95
98
|
|
|
99
|
+
CSV chunking run:
|
|
100
|
+
|
|
101
|
+
```powershell
|
|
102
|
+
csvchunking "data\large_export.csv" --chunk-size 5000
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
The command creates an output directory next to the input file named after the CSV stem. For example, `data\large_export.csv` is split into files such as `data\large_export\large_export_01.csv`, `data\large_export\large_export_02.csv`, and so on.
|
|
106
|
+
|
|
107
|
+
CSV chunking with an explicit encoding:
|
|
108
|
+
|
|
109
|
+
```powershell
|
|
110
|
+
python -m csvchunking "data\large_export.csv" --chunk-size 5000 --encoding utf-8-sig
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
Each chunk contains the original header row plus up to `--chunk-size` data rows. The delimiter is detected automatically; if detection fails, semicolon-separated CSV is used.
|
|
114
|
+
|
|
96
115
|
## Supported CLI options
|
|
97
116
|
|
|
117
|
+
Log analysis options:
|
|
118
|
+
|
|
98
119
|
- `--logs-dir`
|
|
99
120
|
- `--out-dir`
|
|
100
121
|
- `--backup-dir`
|
|
@@ -102,6 +123,28 @@ python .\log_analysis.py --convert
|
|
|
102
123
|
- `--convert`
|
|
103
124
|
- `--self-test`
|
|
104
125
|
|
|
126
|
+
CSV chunking options:
|
|
127
|
+
|
|
128
|
+
- `input_file` - path to the CSV file to split
|
|
129
|
+
- `--chunk-size` - required number of data rows per output file; must be greater than zero
|
|
130
|
+
- `--encoding` - input and output encoding; defaults to `utf-8-sig`
|
|
131
|
+
|
|
132
|
+
## Release process
|
|
133
|
+
|
|
134
|
+
To publish a new release, always test on TestPyPI first, then upload to PyPI only after successful Conda smoke tests:
|
|
135
|
+
|
|
136
|
+
```bash
|
|
137
|
+
export TWINE_USERNAME="__token__"
|
|
138
|
+
export TWINE_PASSWORD="pypi-..."
|
|
139
|
+
|
|
140
|
+
bash scripts/release_testpypi.sh --bump patch
|
|
141
|
+
bash scripts/release_pypi.sh --yes
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
**Notes:**
|
|
145
|
+
- Run and verify the TestPyPI release first, then upload the final package to PyPI.
|
|
146
|
+
- PyPI versions cannot be overwritten or reused.
|
|
147
|
+
|
|
105
148
|
## Library usage
|
|
106
149
|
|
|
107
150
|
```python
|
|
@@ -113,6 +156,7 @@ from loganalysis import (
|
|
|
113
156
|
iter_logical_entries,
|
|
114
157
|
normalize_message,
|
|
115
158
|
)
|
|
159
|
+
from csvchunking import split_csv
|
|
116
160
|
|
|
117
161
|
result = analyze_file(Path("logs/app.txt"))
|
|
118
162
|
print(result["norm_counts"])
|
|
@@ -129,14 +173,21 @@ convert_report_md_to_html_pdf(
|
|
|
129
173
|
Path("log_analyse_out/report.html"),
|
|
130
174
|
Path("log_analyse_out/report.pdf"),
|
|
131
175
|
)
|
|
176
|
+
|
|
177
|
+
chunk_result = split_csv(Path("data/large_export.csv"), chunk_size=5000)
|
|
178
|
+
print(chunk_result.output_dir)
|
|
179
|
+
print(chunk_result.output_files)
|
|
132
180
|
```
|
|
133
181
|
|
|
182
|
+
`split_csv()` returns a `ChunkResult` with the input file, output directory, chunk size, processed data-row count, created file count, and generated output file paths.
|
|
183
|
+
|
|
134
184
|
## Project structure
|
|
135
185
|
|
|
136
186
|
```text
|
|
137
187
|
.
|
|
138
188
|
├─ pyproject.toml
|
|
139
189
|
├─ src/loganalysis/
|
|
190
|
+
├─ src/csvchunking/
|
|
140
191
|
├─ tests/
|
|
141
192
|
├─ docs/
|
|
142
193
|
└─ log_analysis.py
|
|
@@ -151,7 +202,9 @@ Important modules:
|
|
|
151
202
|
- `report_html.py` - HTML/CSS rendering
|
|
152
203
|
- `report_pdf.py` - PDF engine selection and fallback handling
|
|
153
204
|
- `converters.py` - Markdown-to-HTML/PDF conversion
|
|
154
|
-
- `cli.py` - command-line entry point
|
|
205
|
+
- `loganalysis/cli.py` - log analysis command-line entry point
|
|
206
|
+
- `csvchunking/chunker.py` - CSV splitting logic and `ChunkResult`
|
|
207
|
+
- `csvchunking/cli.py` - CSV chunking command-line entry point
|
|
155
208
|
|
|
156
209
|
## HTML/PDF conversion
|
|
157
210
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
# nm-tool-forge
|
|
2
2
|
|
|
3
|
-
`nm-tool-forge` analyzes MigMan text log files with severity tokens such as `INFO`, `ERROR`, and `WARNING` and generates aggregated CSV, Markdown, HTML, and optional PDF reports.
|
|
3
|
+
`nm-tool-forge` analyzes MigMan text log files with severity tokens such as `INFO`, `ERROR`, and `WARNING` and generates aggregated CSV, Markdown, HTML, and optional PDF reports. The package also includes `csvchunking`, a small helper for splitting large CSV files into migration-friendly chunks.
|
|
4
4
|
|
|
5
5
|
The project uses a package-ready `src` layout. The legacy `log_analysis.py` file remains available as a thin compatibility entry point for older local setups.
|
|
6
6
|
|
|
@@ -9,10 +9,11 @@ The project uses a package-ready `src` layout. The legacy `log_analysis.py` file
|
|
|
9
9
|
- Parse logical log entries from multi-line text logs
|
|
10
10
|
- Normalize recurring error patterns for better aggregation
|
|
11
11
|
- Generate aggregated CSV reports
|
|
12
|
-
- Generate Markdown summary reports
|
|
13
|
-
- Optionally convert reports to HTML and PDF
|
|
14
|
-
- Keep a backup copy of analyzed log files
|
|
15
|
-
-
|
|
12
|
+
- Generate Markdown summary reports
|
|
13
|
+
- Optionally convert reports to HTML and PDF
|
|
14
|
+
- Keep a backup copy of analyzed log files
|
|
15
|
+
- Split large CSV files into numbered chunks while preserving the header row
|
|
16
|
+
- Run built-in self-tests from the CLI
|
|
16
17
|
|
|
17
18
|
## Installation
|
|
18
19
|
|
|
@@ -30,13 +31,15 @@ python -m pip install .[pdf,dev]
|
|
|
30
31
|
|
|
31
32
|
## Command-line usage
|
|
32
33
|
|
|
33
|
-
After installation,
|
|
34
|
-
|
|
35
|
-
```powershell
|
|
36
|
-
python -m loganalysis --help
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
34
|
+
After installation, the CLI entry points are available:
|
|
35
|
+
|
|
36
|
+
```powershell
|
|
37
|
+
python -m loganalysis --help
|
|
38
|
+
python -m csvchunking --help
|
|
39
|
+
loganalysis --help
|
|
40
|
+
nm-tool-forge --help
|
|
41
|
+
csvchunking --help
|
|
42
|
+
```
|
|
40
43
|
|
|
41
44
|
Typical analysis run:
|
|
42
45
|
|
|
@@ -50,41 +53,82 @@ Analysis with HTML/PDF conversion:
|
|
|
50
53
|
nm-tool-forge --logs-dir logs --out-dir log_analyse_out --convert
|
|
51
54
|
```
|
|
52
55
|
|
|
53
|
-
Self-test mode:
|
|
54
|
-
|
|
55
|
-
```powershell
|
|
56
|
-
python -m loganalysis --self-test
|
|
57
|
-
```
|
|
58
|
-
|
|
59
|
-
Legacy compatibility call:
|
|
60
|
-
|
|
61
|
-
```powershell
|
|
62
|
-
python .\log_analysis.py --convert
|
|
56
|
+
Self-test mode:
|
|
57
|
+
|
|
58
|
+
```powershell
|
|
59
|
+
python -m loganalysis --self-test
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
Legacy compatibility call:
|
|
63
|
+
|
|
64
|
+
```powershell
|
|
65
|
+
python .\log_analysis.py --convert
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
CSV chunking run:
|
|
69
|
+
|
|
70
|
+
```powershell
|
|
71
|
+
csvchunking "data\large_export.csv" --chunk-size 5000
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
The command creates an output directory next to the input file named after the CSV stem. For example, `data\large_export.csv` is split into files such as `data\large_export\large_export_01.csv`, `data\large_export\large_export_02.csv`, and so on.
|
|
75
|
+
|
|
76
|
+
CSV chunking with an explicit encoding:
|
|
77
|
+
|
|
78
|
+
```powershell
|
|
79
|
+
python -m csvchunking "data\large_export.csv" --chunk-size 5000 --encoding utf-8-sig
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
Each chunk contains the original header row plus up to `--chunk-size` data rows. The delimiter is detected automatically; if detection fails, semicolon-separated CSV is used.
|
|
83
|
+
|
|
84
|
+
## Supported CLI options
|
|
85
|
+
|
|
86
|
+
Log analysis options:
|
|
87
|
+
|
|
88
|
+
- `--logs-dir`
|
|
89
|
+
- `--out-dir`
|
|
90
|
+
- `--backup-dir`
|
|
91
|
+
- `--top-examples`
|
|
92
|
+
- `--convert`
|
|
93
|
+
- `--self-test`
|
|
94
|
+
|
|
95
|
+
CSV chunking options:
|
|
96
|
+
|
|
97
|
+
- `input_file` - path to the CSV file to split
|
|
98
|
+
- `--chunk-size` - required number of data rows per output file; must be greater than zero
|
|
99
|
+
- `--encoding` - input and output encoding; defaults to `utf-8-sig`
|
|
100
|
+
|
|
101
|
+
## Release process
|
|
102
|
+
|
|
103
|
+
To publish a new release, always test on TestPyPI first, then upload to PyPI only after successful Conda smoke tests:
|
|
104
|
+
|
|
105
|
+
```bash
|
|
106
|
+
export TWINE_USERNAME="__token__"
|
|
107
|
+
export TWINE_PASSWORD="pypi-..."
|
|
108
|
+
|
|
109
|
+
bash scripts/release_testpypi.sh --bump patch
|
|
110
|
+
bash scripts/release_pypi.sh --yes
|
|
63
111
|
```
|
|
64
112
|
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
- `--self-test`
|
|
73
|
-
|
|
74
|
-
## Library usage
|
|
75
|
-
|
|
76
|
-
```python
|
|
113
|
+
**Notes:**
|
|
114
|
+
- Run and verify the TestPyPI release first, then upload the final package to PyPI.
|
|
115
|
+
- PyPI versions cannot be overwritten or reused.
|
|
116
|
+
|
|
117
|
+
## Library usage
|
|
118
|
+
|
|
119
|
+
```python
|
|
77
120
|
from pathlib import Path
|
|
78
121
|
|
|
79
122
|
from loganalysis import (
|
|
80
123
|
analyze_file,
|
|
81
124
|
convert_report_md_to_html_pdf,
|
|
82
|
-
iter_logical_entries,
|
|
83
|
-
normalize_message,
|
|
84
|
-
)
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
125
|
+
iter_logical_entries,
|
|
126
|
+
normalize_message,
|
|
127
|
+
)
|
|
128
|
+
from csvchunking import split_csv
|
|
129
|
+
|
|
130
|
+
result = analyze_file(Path("logs/app.txt"))
|
|
131
|
+
print(result["norm_counts"])
|
|
88
132
|
|
|
89
133
|
print(normalize_message(
|
|
90
134
|
'Conversion: X =3100110. 138 The record was not found in table "Teile".'
|
|
@@ -95,20 +139,27 @@ for entry in iter_logical_entries(Path("logs/app.txt")):
|
|
|
95
139
|
|
|
96
140
|
convert_report_md_to_html_pdf(
|
|
97
141
|
Path("log_analyse_out/report.md"),
|
|
98
|
-
Path("log_analyse_out/report.html"),
|
|
99
|
-
Path("log_analyse_out/report.pdf"),
|
|
100
|
-
)
|
|
101
|
-
|
|
142
|
+
Path("log_analyse_out/report.html"),
|
|
143
|
+
Path("log_analyse_out/report.pdf"),
|
|
144
|
+
)
|
|
145
|
+
|
|
146
|
+
chunk_result = split_csv(Path("data/large_export.csv"), chunk_size=5000)
|
|
147
|
+
print(chunk_result.output_dir)
|
|
148
|
+
print(chunk_result.output_files)
|
|
149
|
+
```
|
|
150
|
+
|
|
151
|
+
`split_csv()` returns a `ChunkResult` with the input file, output directory, chunk size, processed data-row count, created file count, and generated output file paths.
|
|
102
152
|
|
|
103
153
|
## Project structure
|
|
104
154
|
|
|
105
155
|
```text
|
|
106
156
|
.
|
|
107
|
-
├─ pyproject.toml
|
|
108
|
-
├─ src/loganalysis/
|
|
109
|
-
├─
|
|
110
|
-
├─
|
|
111
|
-
|
|
157
|
+
├─ pyproject.toml
|
|
158
|
+
├─ src/loganalysis/
|
|
159
|
+
├─ src/csvchunking/
|
|
160
|
+
├─ tests/
|
|
161
|
+
├─ docs/
|
|
162
|
+
└─ log_analysis.py
|
|
112
163
|
```
|
|
113
164
|
|
|
114
165
|
Important modules:
|
|
@@ -118,9 +169,11 @@ Important modules:
|
|
|
118
169
|
- `normalization.py` - message normalization
|
|
119
170
|
- `report_markdown.py` - Markdown report model and rendering
|
|
120
171
|
- `report_html.py` - HTML/CSS rendering
|
|
121
|
-
- `report_pdf.py` - PDF engine selection and fallback handling
|
|
122
|
-
- `converters.py` - Markdown-to-HTML/PDF conversion
|
|
123
|
-
- `cli.py` - command-line entry point
|
|
172
|
+
- `report_pdf.py` - PDF engine selection and fallback handling
|
|
173
|
+
- `converters.py` - Markdown-to-HTML/PDF conversion
|
|
174
|
+
- `loganalysis/cli.py` - log analysis command-line entry point
|
|
175
|
+
- `csvchunking/chunker.py` - CSV splitting logic and `ChunkResult`
|
|
176
|
+
- `csvchunking/cli.py` - CSV chunking command-line entry point
|
|
124
177
|
|
|
125
178
|
## HTML/PDF conversion
|
|
126
179
|
|
|
@@ -1,67 +1,68 @@
|
|
|
1
|
-
[build-system]
|
|
2
|
-
requires = ["setuptools>=69", "wheel"]
|
|
3
|
-
build-backend = "setuptools.build_meta"
|
|
4
|
-
|
|
5
|
-
[project]
|
|
6
|
-
name = "nm-tool-forge"
|
|
7
|
-
version = "0.1.0"
|
|
8
|
-
description = "Analyze MigMan log files and generate aggregated CSV, Markdown, HTML, and optional PDF reports."
|
|
9
|
-
readme = { file = "README.md", content-type = "text/markdown" }
|
|
10
|
-
requires-python = ">=3.10"
|
|
11
|
-
license = "MIT"
|
|
12
|
-
license-files = ["LICENSE"]
|
|
13
|
-
authors = [
|
|
14
|
-
{ name = "Stefan Ewald", email = "s.ew@outlook.de" }
|
|
15
|
-
]
|
|
16
|
-
keywords = ["migman", "logs", "analysis", "reporting", "csv", "markdown", "pdf"]
|
|
17
|
-
classifiers = [
|
|
18
|
-
"Development Status :: 4 - Beta",
|
|
19
|
-
"Intended Audience :: Developers",
|
|
20
|
-
"Programming Language :: Python :: 3",
|
|
21
|
-
"Programming Language :: Python :: 3.10",
|
|
22
|
-
"Programming Language :: Python :: 3.11",
|
|
23
|
-
"Programming Language :: Python :: 3.12",
|
|
24
|
-
"Programming Language :: Python :: 3.13",
|
|
25
|
-
"Topic :: Software Development :: Libraries :: Python Modules",
|
|
26
|
-
"Topic :: Utilities",
|
|
27
|
-
]
|
|
28
|
-
|
|
29
|
-
dependencies = [
|
|
30
|
-
"chardet>=5.0",
|
|
31
|
-
]
|
|
32
|
-
|
|
33
|
-
[project.optional-dependencies]
|
|
34
|
-
pdf = [
|
|
35
|
-
"weasyprint>=62",
|
|
36
|
-
]
|
|
37
|
-
dev = [
|
|
38
|
-
"pytest>=8.0",
|
|
39
|
-
"build>=1.2",
|
|
40
|
-
"twine>=5.0",
|
|
41
|
-
"ruff>=0.11",
|
|
42
|
-
]
|
|
43
|
-
|
|
44
|
-
[project.urls]
|
|
45
|
-
Homepage = "https://github.com/Jack736-ui/migman_log"
|
|
46
|
-
Issues = "https://github.com/Jack736-ui/migman_log/issues"
|
|
47
|
-
|
|
48
|
-
[project.scripts]
|
|
49
|
-
nm-tool-forge = "loganalysis.cli:main"
|
|
50
|
-
loganalysis = "loganalysis.cli:main"
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=69", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "nm-tool-forge"
|
|
7
|
+
version = "0.2.4"
|
|
8
|
+
description = "Analyze MigMan log files and generate aggregated CSV, Markdown, HTML, and optional PDF reports."
|
|
9
|
+
readme = { file = "README.md", content-type = "text/markdown" }
|
|
10
|
+
requires-python = ">=3.10"
|
|
11
|
+
license = "MIT"
|
|
12
|
+
license-files = ["LICENSE"]
|
|
13
|
+
authors = [
|
|
14
|
+
{ name = "Stefan Ewald", email = "s.ew@outlook.de" }
|
|
15
|
+
]
|
|
16
|
+
keywords = ["migman", "logs", "analysis", "reporting", "csv", "markdown", "pdf"]
|
|
17
|
+
classifiers = [
|
|
18
|
+
"Development Status :: 4 - Beta",
|
|
19
|
+
"Intended Audience :: Developers",
|
|
20
|
+
"Programming Language :: Python :: 3",
|
|
21
|
+
"Programming Language :: Python :: 3.10",
|
|
22
|
+
"Programming Language :: Python :: 3.11",
|
|
23
|
+
"Programming Language :: Python :: 3.12",
|
|
24
|
+
"Programming Language :: Python :: 3.13",
|
|
25
|
+
"Topic :: Software Development :: Libraries :: Python Modules",
|
|
26
|
+
"Topic :: Utilities",
|
|
27
|
+
]
|
|
28
|
+
|
|
29
|
+
dependencies = [
|
|
30
|
+
"chardet>=5.0",
|
|
31
|
+
]
|
|
32
|
+
|
|
33
|
+
[project.optional-dependencies]
|
|
34
|
+
pdf = [
|
|
35
|
+
"weasyprint>=62",
|
|
36
|
+
]
|
|
37
|
+
dev = [
|
|
38
|
+
"pytest>=8.0",
|
|
39
|
+
"build>=1.2",
|
|
40
|
+
"twine>=5.0",
|
|
41
|
+
"ruff>=0.11",
|
|
42
|
+
]
|
|
43
|
+
|
|
44
|
+
[project.urls]
|
|
45
|
+
Homepage = "https://github.com/Jack736-ui/migman_log"
|
|
46
|
+
Issues = "https://github.com/Jack736-ui/migman_log/issues"
|
|
47
|
+
|
|
48
|
+
[project.scripts]
|
|
49
|
+
nm-tool-forge = "loganalysis.cli:main"
|
|
50
|
+
loganalysis = "loganalysis.cli:main"
|
|
51
|
+
csvchunking = "csvchunking.cli:main"
|
|
52
|
+
|
|
53
|
+
[tool.setuptools]
|
|
54
|
+
package-dir = { "" = "src" }
|
|
55
|
+
|
|
56
|
+
[tool.setuptools.packages.find]
|
|
57
|
+
where = ["src"]
|
|
58
|
+
|
|
59
|
+
[tool.pytest.ini_options]
|
|
60
|
+
testpaths = ["tests"]
|
|
61
|
+
addopts = "--basetemp=tests_tmp"
|
|
62
|
+
|
|
63
|
+
[tool.ruff]
|
|
64
|
+
line-length = 120
|
|
65
|
+
target-version = "py310"
|
|
66
|
+
|
|
67
|
+
[tool.ruff.lint]
|
|
67
68
|
select = ["E", "F", "I", "B", "UP"]
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
import csv
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
@dataclass(frozen=True)
class ChunkResult:
    """Immutable summary of one ``split_csv`` run."""

    # CSV file that was split.
    input_file: Path
    # Directory the chunk files were written to.
    output_dir: Path
    # Maximum number of data rows per chunk file, as requested by the caller.
    chunk_size: int
    # Number of data rows read from the input (the header row is not counted).
    data_rows_processed: int
    # Number of chunk files written.
    files_created: int
    # Paths of the written chunk files, in creation order.
    output_files: tuple[Path, ...]


class _SemicolonDialect(csv.excel):
    """Fallback dialect: ``csv.excel`` conventions with ``;`` as delimiter.

    A dedicated subclass is used so the shared ``csv.excel`` class is never
    modified; changing ``csv.excel.delimiter`` in place would silently alter
    every other user of the default dialect in the same process.
    """

    delimiter = ";"


def _write_chunk(
    out_path: Path,
    header: list[str],
    rows: list[list[str]],
    dialect: type[csv.Dialect],
    encoding: str,
) -> None:
    """Write ``header`` followed by ``rows`` to ``out_path`` using ``dialect``."""
    with open(out_path, "w", encoding=encoding, newline="") as out:
        writer = csv.writer(out, dialect)
        writer.writerow(header)
        writer.writerows(rows)


def split_csv(
    input_file: Path,
    chunk_size: int,
    encoding: str = "utf-8-sig",
) -> ChunkResult:
    """Split a CSV file into numbered chunk files that each repeat the header.

    The chunks are written to a directory named after the input file's stem,
    located next to the input file. Chunk files are named
    ``<stem>_<NN><suffix>`` with a counter starting at 01.

    Args:
        input_file: Path of the CSV file to split.
        chunk_size: Maximum number of data rows per chunk file.
        encoding: Encoding used to read the input and write the chunks.

    Returns:
        A ``ChunkResult`` describing the run.

    Raises:
        FileNotFoundError: If ``input_file`` is not an existing file.
        ValueError: If ``chunk_size`` is not positive or the input has no rows.
    """
    source = Path(input_file)
    if not source.is_file():
        raise FileNotFoundError(f"Input file not found: {input_file}")
    if chunk_size <= 0:
        raise ValueError("chunk_size must be greater than 0")

    output_dir = source.parent / source.stem
    output_dir.mkdir(exist_ok=True)

    output_files: list[Path] = []
    data_rows = 0

    def flush(rows: list[list[str]]) -> None:
        # The counter is derived from the files written so far, so numbering
        # starts at 01 and stays in step with ``output_files``.
        out_path = output_dir / f"{source.stem}_{len(output_files) + 1:02d}{source.suffix}"
        _write_chunk(out_path, header, rows, dialect, encoding)
        output_files.append(out_path)

    with open(source, encoding=encoding, newline="") as f:
        # Detect the delimiter from the first 4096 characters, then rewind so
        # the reader starts at the header row.
        sample = f.read(4096)
        f.seek(0)
        try:
            dialect = csv.Sniffer().sniff(sample)
        except csv.Error:
            # Detection failed: fall back to semicolon-separated CSV.
            dialect = _SemicolonDialect

        reader = csv.reader(f, dialect)
        try:
            header = next(reader)
        except StopIteration as exc:
            raise ValueError("Input file is empty.") from exc

        chunk: list[list[str]] = []
        for row in reader:
            chunk.append(row)
            data_rows += 1
            if len(chunk) == chunk_size:
                flush(chunk)
                chunk = []
        if chunk:
            # Remaining rows that did not fill a complete chunk.
            flush(chunk)

    return ChunkResult(
        input_file=source,
        output_dir=output_dir,
        chunk_size=chunk_size,
        data_rows_processed=data_rows,
        files_created=len(output_files),
        output_files=tuple(output_files),
    )
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
import sys
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
from .chunker import split_csv
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def main() -> None:
|
|
9
|
+
parser = argparse.ArgumentParser(
|
|
10
|
+
description="Split a large CSV file into smaller chunks with a header row.",
|
|
11
|
+
)
|
|
12
|
+
parser.add_argument("input_file", help="Path to the CSV file")
|
|
13
|
+
parser.add_argument(
|
|
14
|
+
"--chunk-size",
|
|
15
|
+
type=int,
|
|
16
|
+
required=True,
|
|
17
|
+
help="Number of data rows per output file; must be greater than 0",
|
|
18
|
+
)
|
|
19
|
+
parser.add_argument("--encoding", default="utf-8-sig", help="Input and output encoding (Default: utf-8-sig)")
|
|
20
|
+
args = parser.parse_args()
|
|
21
|
+
try:
|
|
22
|
+
result = split_csv(Path(args.input_file), args.chunk_size, encoding=args.encoding)
|
|
23
|
+
except Exception as e:
|
|
24
|
+
print(f"Error: {e}", file=sys.stderr)
|
|
25
|
+
sys.exit(1)
|
|
26
|
+
print("CSV chunking completed.")
|
|
27
|
+
print(f"- Input: {result.input_file}")
|
|
28
|
+
print(f"- Output directory: {result.output_dir}")
|
|
29
|
+
print(f"- Chunk size: {result.chunk_size}")
|
|
30
|
+
print(f"- Data rows processed: {result.data_rows_processed}")
|
|
31
|
+
print(f"- Files created: {result.files_created}")
|
|
@@ -1,16 +1,16 @@
|
|
|
1
|
-
from __future__ import annotations
|
|
2
|
-
|
|
3
|
-
from .analysis import analyze_file, run_analysis
|
|
4
|
-
from .converters import convert_report_md_to_html_pdf
|
|
5
|
-
from .normalization import normalize_message
|
|
6
|
-
from .parsing import iter_logical_entries
|
|
7
|
-
|
|
8
|
-
__all__ = [
|
|
9
|
-
"analyze_file",
|
|
10
|
-
"convert_report_md_to_html_pdf",
|
|
11
|
-
"iter_logical_entries",
|
|
12
|
-
"normalize_message",
|
|
13
|
-
"run_analysis",
|
|
14
|
-
]
|
|
15
|
-
|
|
16
|
-
__version__ = "0.
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from .analysis import analyze_file, run_analysis
|
|
4
|
+
from .converters import convert_report_md_to_html_pdf
|
|
5
|
+
from .normalization import normalize_message
|
|
6
|
+
from .parsing import iter_logical_entries
|
|
7
|
+
|
|
8
|
+
__all__ = [
|
|
9
|
+
"analyze_file",
|
|
10
|
+
"convert_report_md_to_html_pdf",
|
|
11
|
+
"iter_logical_entries",
|
|
12
|
+
"normalize_message",
|
|
13
|
+
"run_analysis",
|
|
14
|
+
]
|
|
15
|
+
|
|
16
|
+
__version__ = "0.2.4"
|
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
-
import argparse
|
|
4
|
-
from
|
|
5
|
-
from
|
|
3
|
+
import argparse
|
|
4
|
+
from collections.abc import Sequence
|
|
5
|
+
from pathlib import Path
|
|
6
6
|
|
|
7
7
|
from .analysis import NoLogFilesError, run_analysis
|
|
8
8
|
from .constants import DEFAULT_LOGS_DIR, DEFAULT_OUT_DIR, DEFAULT_TOP_EXAMPLES, EXIT_SUCCESS
|
|
@@ -16,7 +16,11 @@ def build_parser() -> argparse.ArgumentParser:
|
|
|
16
16
|
parser = argparse.ArgumentParser(
|
|
17
17
|
description="Aggregated analysis of log files (INFO/ERROR/WARNING) in logs/*.txt",
|
|
18
18
|
)
|
|
19
|
-
parser.add_argument(
|
|
19
|
+
parser.add_argument(
|
|
20
|
+
"--logs-dir",
|
|
21
|
+
default=DEFAULT_LOGS_DIR,
|
|
22
|
+
help=f"Subdirectory with log files (Default: {DEFAULT_LOGS_DIR})",
|
|
23
|
+
)
|
|
20
24
|
parser.add_argument("--out-dir", default=DEFAULT_OUT_DIR, help=f"Output directory (Default: {DEFAULT_OUT_DIR})")
|
|
21
25
|
parser.add_argument("--backup-dir", default=None, help="Backup directory (Default: <out-dir>/backup)")
|
|
22
26
|
parser.add_argument(
|
|
@@ -1,10 +1,10 @@
|
|
|
1
|
-
from __future__ import annotations
|
|
2
|
-
|
|
3
|
-
import csv
|
|
4
|
-
from
|
|
5
|
-
from
|
|
6
|
-
|
|
7
|
-
from .filesystem import ensure_dir
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import csv
|
|
4
|
+
from collections.abc import Iterable
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
from .filesystem import ensure_dir
|
|
8
8
|
|
|
9
9
|
|
|
10
10
|
def write_csv(path: Path, rows: Iterable[tuple[str, str, int]], headers: list[str]) -> None:
|
|
@@ -1,9 +1,9 @@
|
|
|
1
|
-
from __future__ import annotations
|
|
2
|
-
|
|
3
|
-
from
|
|
4
|
-
from
|
|
5
|
-
|
|
6
|
-
from .constants import RE_ENTRY_START, RE_LINE_PREFIX, RE_TRAILING_DATASET, RE_WHITESPACE, SEVERITY_ALIASES
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from collections.abc import Iterable
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
from .constants import RE_ENTRY_START, RE_LINE_PREFIX, RE_TRAILING_DATASET, RE_WHITESPACE, SEVERITY_ALIASES
|
|
7
7
|
from .encoding import detect_encoding
|
|
8
8
|
from .models import ParsedLine
|
|
9
9
|
|
|
@@ -52,12 +52,14 @@ def build_markdown_report(
|
|
|
52
52
|
continue
|
|
53
53
|
|
|
54
54
|
lines.append("| Severity | Count | Normalized message | Examples |")
|
|
55
|
-
lines.append("|---|---:|---|---|")
|
|
56
|
-
for (severity, normalized_message), count in top_norm:
|
|
57
|
-
examples_counter = analysis.norm_examples[(severity, normalized_message)]
|
|
58
|
-
examples = [
|
|
59
|
-
|
|
60
|
-
|
|
55
|
+
lines.append("|---|---:|---|---|")
|
|
56
|
+
for (severity, normalized_message), count in top_norm:
|
|
57
|
+
examples_counter = analysis.norm_examples[(severity, normalized_message)]
|
|
58
|
+
examples = [
|
|
59
|
+
f"{message} ({amount})" for message, amount in examples_counter.most_common(config.top_examples)
|
|
60
|
+
]
|
|
61
|
+
examples_text = "<br>".join(examples) if examples else ""
|
|
62
|
+
lines.append(f"| {severity} | {count} | {normalized_message} | {examples_text} |")
|
|
61
63
|
lines.append("")
|
|
62
64
|
|
|
63
65
|
lines.append("## Overall summary (all files)")
|
|
@@ -65,12 +67,14 @@ def build_markdown_report(
|
|
|
65
67
|
top_global = _top_counter_items(summary.global_norm, REPORT_TOP_GLOBAL)
|
|
66
68
|
if top_global:
|
|
67
69
|
lines.append("| Severity | Count | Normalized message | Examples |")
|
|
68
|
-
lines.append("|---|---:|---|---|")
|
|
69
|
-
for (severity, normalized_message), count in top_global:
|
|
70
|
-
examples_counter = summary.global_norm_examples[(severity, normalized_message)]
|
|
71
|
-
examples = [
|
|
72
|
-
|
|
73
|
-
|
|
70
|
+
lines.append("|---|---:|---|---|")
|
|
71
|
+
for (severity, normalized_message), count in top_global:
|
|
72
|
+
examples_counter = summary.global_norm_examples[(severity, normalized_message)]
|
|
73
|
+
examples = [
|
|
74
|
+
f"{message} ({amount})" for message, amount in examples_counter.most_common(config.top_examples)
|
|
75
|
+
]
|
|
76
|
+
examples_text = "<br>".join(examples) if examples else ""
|
|
77
|
+
lines.append(f"| {severity} | {count} | {normalized_message} | {examples_text} |")
|
|
74
78
|
lines.append("")
|
|
75
79
|
else:
|
|
76
80
|
lines.append("_No messages found._")
|
|
@@ -1,10 +1,15 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
3
|
import shutil
|
|
4
|
-
from contextlib import redirect_stderr, redirect_stdout
|
|
5
|
-
from io import StringIO
|
|
6
|
-
|
|
7
|
-
from .constants import
|
|
4
|
+
from contextlib import redirect_stderr, redirect_stdout
|
|
5
|
+
from io import StringIO
|
|
6
|
+
|
|
7
|
+
from .constants import (
|
|
8
|
+
COMMON_MOJIBAKE_TOKENS,
|
|
9
|
+
LATEX_PDF_ENGINES,
|
|
10
|
+
LATEX_SPECIAL_CHAR_REPLACEMENTS,
|
|
11
|
+
RE_MARKDOWN_TABLE_SEPARATOR,
|
|
12
|
+
)
|
|
8
13
|
|
|
9
14
|
|
|
10
15
|
def select_pdf_engine() -> str | None:
|
|
@@ -9,17 +9,23 @@ from .report_pdf import build_pdf_safe_markdown, escape_latex_text, make_markdow
|
|
|
9
9
|
|
|
10
10
|
|
|
11
11
|
def run_self_tests() -> None:
|
|
12
|
-
"""Run deterministic built-in assertions for quick local verification."""
|
|
13
|
-
|
|
14
|
-
for raw_message, expected in NORMALIZATION_SELF_TEST_CASES:
|
|
15
|
-
actual = normalize_message(raw_message)
|
|
16
|
-
assert actual == expected, f"normalize_message({raw_message!r}) -> {actual!r}, expected {expected!r}"
|
|
17
|
-
|
|
18
|
-
assert is_entry_start("ERROR\tLine 1: tab-separated severity")
|
|
19
|
-
assert
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
12
|
+
"""Run deterministic built-in assertions for quick local verification."""
|
|
13
|
+
|
|
14
|
+
for raw_message, expected in NORMALIZATION_SELF_TEST_CASES:
|
|
15
|
+
actual = normalize_message(raw_message)
|
|
16
|
+
assert actual == expected, f"normalize_message({raw_message!r}) -> {actual!r}, expected {expected!r}"
|
|
17
|
+
|
|
18
|
+
assert is_entry_start("ERROR\tLine 1: tab-separated severity")
|
|
19
|
+
assert (
|
|
20
|
+
escape_latex_text(r"D:\DATEN_UEBERNAHME\A&B")
|
|
21
|
+
== r"D:\textbackslash{}DATEN\_UEBERNAHME\textbackslash{}A\&B"
|
|
22
|
+
)
|
|
23
|
+
assert make_markdown_table_line_pdf_safe("|---|---:|---|") == "|---|---:|---|"
|
|
24
|
+
assert (
|
|
25
|
+
make_markdown_table_line_pdf_safe(r"| ERROR | D:\DATEN_1<br>foo |")
|
|
26
|
+
== r"| ERROR | D:\textbackslash{}DATEN\_1 ; foo |"
|
|
27
|
+
)
|
|
28
|
+
assert build_pdf_safe_markdown("plain\n| A | B |\n").endswith("\n")
|
|
23
29
|
|
|
24
30
|
sample_report_markdown = """# Log Analysis Report
|
|
25
31
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: nm-tool-forge
|
|
3
|
-
Version: 0.1.0
|
|
3
|
+
Version: 0.2.4
|
|
4
4
|
Summary: Analyze MigMan log files and generate aggregated CSV, Markdown, HTML, and optional PDF reports.
|
|
5
5
|
Author-email: Stefan Ewald <s.ew@outlook.de>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -31,7 +31,7 @@ Dynamic: license-file
|
|
|
31
31
|
|
|
32
32
|
# nm-tool-forge
|
|
33
33
|
|
|
34
|
-
`nm-tool-forge` analyzes MigMan text log files with severity tokens such as `INFO`, `ERROR`, and `WARNING` and generates aggregated CSV, Markdown, HTML, and optional PDF reports.
|
|
34
|
+
`nm-tool-forge` analyzes MigMan text log files with severity tokens such as `INFO`, `ERROR`, and `WARNING` and generates aggregated CSV, Markdown, HTML, and optional PDF reports. The package also includes `csvchunking`, a small helper for splitting large CSV files into migration-friendly chunks.
|
|
35
35
|
|
|
36
36
|
The project uses a package-ready `src` layout. The legacy `log_analysis.py` file remains available as a thin compatibility entry point for older local setups.
|
|
37
37
|
|
|
@@ -43,6 +43,7 @@ The project uses a package-ready `src` layout. The legacy `log_analysis.py` file
|
|
|
43
43
|
- Generate Markdown summary reports
|
|
44
44
|
- Optionally convert reports to HTML and PDF
|
|
45
45
|
- Keep a backup copy of analyzed log files
|
|
46
|
+
- Split large CSV files into numbered chunks while preserving the header row
|
|
46
47
|
- Run built-in self-tests from the CLI
|
|
47
48
|
|
|
48
49
|
## Installation
|
|
@@ -61,12 +62,14 @@ python -m pip install .[pdf,dev]
|
|
|
61
62
|
|
|
62
63
|
## Command-line usage
|
|
63
64
|
|
|
64
|
-
After installation,
|
|
65
|
+
After installation, the CLI entry points are available:
|
|
65
66
|
|
|
66
67
|
```powershell
|
|
67
68
|
python -m loganalysis --help
|
|
69
|
+
python -m csvchunking --help
|
|
68
70
|
loganalysis --help
|
|
69
71
|
nm-tool-forge --help
|
|
72
|
+
csvchunking --help
|
|
70
73
|
```
|
|
71
74
|
|
|
72
75
|
Typical analysis run:
|
|
@@ -93,8 +96,26 @@ Legacy compatibility call:
|
|
|
93
96
|
python .\log_analysis.py --convert
|
|
94
97
|
```
|
|
95
98
|
|
|
99
|
+
CSV chunking run:
|
|
100
|
+
|
|
101
|
+
```powershell
|
|
102
|
+
csvchunking "data\large_export.csv" --chunk-size 5000
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
The command creates an output directory next to the input file named after the CSV stem. For example, `data\large_export.csv` is split into files such as `data\large_export\large_export_01.csv`, `data\large_export\large_export_02.csv`, and so on.
|
|
106
|
+
|
|
107
|
+
CSV chunking with an explicit encoding:
|
|
108
|
+
|
|
109
|
+
```powershell
|
|
110
|
+
python -m csvchunking "data\large_export.csv" --chunk-size 5000 --encoding utf-8-sig
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
Each chunk contains the original header row plus up to `--chunk-size` data rows. The delimiter is detected automatically; if detection fails, semicolon-separated CSV is used.
|
|
114
|
+
|
|
96
115
|
## Supported CLI options
|
|
97
116
|
|
|
117
|
+
Log analysis options:
|
|
118
|
+
|
|
98
119
|
- `--logs-dir`
|
|
99
120
|
- `--out-dir`
|
|
100
121
|
- `--backup-dir`
|
|
@@ -102,6 +123,28 @@ python .\log_analysis.py --convert
|
|
|
102
123
|
- `--convert`
|
|
103
124
|
- `--self-test`
|
|
104
125
|
|
|
126
|
+
CSV chunking options:
|
|
127
|
+
|
|
128
|
+
- `input_file` - path to the CSV file to split
|
|
129
|
+
- `--chunk-size` - required number of data rows per output file; must be greater than zero
|
|
130
|
+
- `--encoding` - input and output encoding; defaults to `utf-8-sig`
|
|
131
|
+
|
|
132
|
+
## Release process
|
|
133
|
+
|
|
134
|
+
To publish a new release, always test on TestPyPI first, then upload to PyPI only after successful Conda smoke tests:
|
|
135
|
+
|
|
136
|
+
```bash
|
|
137
|
+
export TWINE_USERNAME="__token__"
|
|
138
|
+
export TWINE_PASSWORD="pypi-..."
|
|
139
|
+
|
|
140
|
+
bash scripts/release_testpypi.sh --bump patch
|
|
141
|
+
bash scripts/release_pypi.sh --yes
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
**Notes:**
|
|
145
|
+
- Run and verify the TestPyPI release first, then upload the final package to PyPI.
|
|
146
|
+
- PyPI versions cannot be overwritten or reused.
|
|
147
|
+
|
|
105
148
|
## Library usage
|
|
106
149
|
|
|
107
150
|
```python
|
|
@@ -113,6 +156,7 @@ from loganalysis import (
|
|
|
113
156
|
iter_logical_entries,
|
|
114
157
|
normalize_message,
|
|
115
158
|
)
|
|
159
|
+
from csvchunking import split_csv
|
|
116
160
|
|
|
117
161
|
result = analyze_file(Path("logs/app.txt"))
|
|
118
162
|
print(result["norm_counts"])
|
|
@@ -129,14 +173,21 @@ convert_report_md_to_html_pdf(
|
|
|
129
173
|
Path("log_analyse_out/report.html"),
|
|
130
174
|
Path("log_analyse_out/report.pdf"),
|
|
131
175
|
)
|
|
176
|
+
|
|
177
|
+
chunk_result = split_csv(Path("data/large_export.csv"), chunk_size=5000)
|
|
178
|
+
print(chunk_result.output_dir)
|
|
179
|
+
print(chunk_result.output_files)
|
|
132
180
|
```
|
|
133
181
|
|
|
182
|
+
`split_csv()` returns a `ChunkResult` with the input file, output directory, chunk size, processed data-row count, created file count, and generated output file paths.
|
|
183
|
+
|
|
134
184
|
## Project structure
|
|
135
185
|
|
|
136
186
|
```text
|
|
137
187
|
.
|
|
138
188
|
├─ pyproject.toml
|
|
139
189
|
├─ src/loganalysis/
|
|
190
|
+
├─ src/csvchunking/
|
|
140
191
|
├─ tests/
|
|
141
192
|
├─ docs/
|
|
142
193
|
└─ log_analysis.py
|
|
@@ -151,7 +202,9 @@ Important modules:
|
|
|
151
202
|
- `report_html.py` - HTML/CSS rendering
|
|
152
203
|
- `report_pdf.py` - PDF engine selection and fallback handling
|
|
153
204
|
- `converters.py` - Markdown-to-HTML/PDF conversion
|
|
154
|
-
- `cli.py` - command-line entry point
|
|
205
|
+
- `loganalysis/cli.py` - log analysis command-line entry point
|
|
206
|
+
- `csvchunking/chunker.py` - CSV splitting logic and `ChunkResult`
|
|
207
|
+
- `csvchunking/cli.py` - CSV chunking command-line entry point
|
|
155
208
|
|
|
156
209
|
## HTML/PDF conversion
|
|
157
210
|
|
|
@@ -1,6 +1,10 @@
|
|
|
1
1
|
LICENSE
|
|
2
2
|
README.md
|
|
3
3
|
pyproject.toml
|
|
4
|
+
src/csvchunking/__init__.py
|
|
5
|
+
src/csvchunking/__main__.py
|
|
6
|
+
src/csvchunking/chunker.py
|
|
7
|
+
src/csvchunking/cli.py
|
|
4
8
|
src/loganalysis/__init__.py
|
|
5
9
|
src/loganalysis/__main__.py
|
|
6
10
|
src/loganalysis/analysis.py
|
|
@@ -25,6 +29,7 @@ src/nm_tool_forge.egg-info/entry_points.txt
|
|
|
25
29
|
src/nm_tool_forge.egg-info/requires.txt
|
|
26
30
|
src/nm_tool_forge.egg-info/top_level.txt
|
|
27
31
|
tests/test_analysis.py
|
|
32
|
+
tests/test_csvchunking.py
|
|
28
33
|
tests/test_normalization.py
|
|
29
34
|
tests/test_parsing.py
|
|
30
35
|
tests/test_report_html.py
|
|
@@ -18,11 +18,12 @@ def test_analyze_file_aggregates_raw_and_normalized_counts(tmp_path: Path) -> No
|
|
|
18
18
|
result = analyze_file(log_path)
|
|
19
19
|
|
|
20
20
|
assert result.total_lines == 3
|
|
21
|
-
assert result.total_entries == 3
|
|
22
|
-
assert result.unknown_lines == 0
|
|
23
|
-
assert result.raw_counts[("WARNING", "Different issue")] == 1
|
|
24
|
-
|
|
25
|
-
assert
|
|
21
|
+
assert result.total_entries == 3
|
|
22
|
+
assert result.unknown_lines == 0
|
|
23
|
+
assert result.raw_counts[("WARNING", "Different issue")] == 1
|
|
24
|
+
normalized_key = ("ERROR", 'Conversion: X =<VALUE> The record was not found in table "Teile".')
|
|
25
|
+
assert result.norm_counts[normalized_key] == 2
|
|
26
|
+
assert len(result.norm_examples[normalized_key]) == 2
|
|
26
27
|
|
|
27
28
|
|
|
28
29
|
def test_run_analysis_writes_outputs_and_report(tmp_path: Path) -> None:
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
import pytest
|
|
2
|
+
|
|
3
|
+
from csvchunking.chunker import split_csv
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def make_csv(tmp_path, name, header, rows, encoding="utf-8-sig", delimiter=";"):
|
|
7
|
+
file = tmp_path / name
|
|
8
|
+
with open(file, "w", encoding=encoding, newline="") as f:
|
|
9
|
+
f.write(delimiter.join(header) + "\n")
|
|
10
|
+
for row in rows:
|
|
11
|
+
f.write(delimiter.join(row) + "\n")
|
|
12
|
+
return file
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def test_regular_split(tmp_path):
|
|
16
|
+
header = ["col1", "col2"]
|
|
17
|
+
rows = [["A", "1"], ["B", "2"], ["C", "3"], ["D", "4"], ["E", "5"]]
|
|
18
|
+
file = make_csv(tmp_path, "sample.csv", header, rows)
|
|
19
|
+
result = split_csv(file, chunk_size=2)
|
|
20
|
+
assert result.files_created == 3
|
|
21
|
+
for out in result.output_files:
|
|
22
|
+
with open(out, encoding="utf-8-sig") as f:
|
|
23
|
+
lines = f.read().splitlines()
|
|
24
|
+
assert lines[0] == "col1;col2"
|
|
25
|
+
assert (result.output_dir / "sample_01.csv").exists()
|
|
26
|
+
assert (result.output_dir / "sample_02.csv").exists()
|
|
27
|
+
assert (result.output_dir / "sample_03.csv").exists()
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def test_header_in_each_file(tmp_path):
|
|
31
|
+
header = ["foo", "bar"]
|
|
32
|
+
rows = [["x", "1"], ["y", "2"], ["z", "3"]]
|
|
33
|
+
file = make_csv(tmp_path, "test.csv", header, rows)
|
|
34
|
+
result = split_csv(file, chunk_size=1)
|
|
35
|
+
for out in result.output_files:
|
|
36
|
+
with open(out, encoding="utf-8-sig") as f:
|
|
37
|
+
assert f.readline().strip() == "foo;bar"
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def test_filename_with_spaces(tmp_path):
|
|
41
|
+
header = ["a", "b"]
|
|
42
|
+
rows = [["1", "2"]]
|
|
43
|
+
file = make_csv(tmp_path, "Part-Storage Areas Relationships.csv", header, rows)
|
|
44
|
+
result = split_csv(file, chunk_size=1)
|
|
45
|
+
assert result.output_dir.name == "Part-Storage Areas Relationships"
|
|
46
|
+
assert (result.output_dir / "Part-Storage Areas Relationships_01.csv").exists()
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def test_invalid_chunk_size(tmp_path):
|
|
50
|
+
header = ["a", "b"]
|
|
51
|
+
rows = [["1", "2"]]
|
|
52
|
+
file = make_csv(tmp_path, "fail.csv", header, rows)
|
|
53
|
+
with pytest.raises(ValueError):
|
|
54
|
+
split_csv(file, chunk_size=0)
|
|
55
|
+
with pytest.raises(ValueError):
|
|
56
|
+
split_csv(file, chunk_size=-1)
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def test_empty_file(tmp_path):
|
|
60
|
+
file = tmp_path / "empty.csv"
|
|
61
|
+
file.write_text("")
|
|
62
|
+
with pytest.raises(ValueError):
|
|
63
|
+
split_csv(file, chunk_size=1)
|
|
@@ -16,7 +16,9 @@ def test_build_and_parse_markdown_report_roundtrip(tmp_path: Path) -> None:
|
|
|
16
16
|
unknown_lines=0,
|
|
17
17
|
raw_counts=Counter({("ERROR", 'Conversion: X =3100110. 138 The record was not found in table "Teile".'): 2}),
|
|
18
18
|
norm_counts=Counter({normalized_key: 2}),
|
|
19
|
-
norm_examples={
|
|
19
|
+
norm_examples={
|
|
20
|
+
normalized_key: Counter({'Conversion: X =3100110. 138 The record was not found in table "Teile".': 2})
|
|
21
|
+
},
|
|
20
22
|
backup_path=tmp_path / "backup" / "demo.txt.bak",
|
|
21
23
|
)
|
|
22
24
|
summary = AnalysisSummary(
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|