@nguyenphp/antigravity-marketing 1.0.18 → 1.0.19
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +186 -78
- package/package.json +4 -3
- package/templates/.agent/skills/marketing-report-expert/SKILL.md +70 -0
- package/templates/.agent/skills/minimax-docx/LICENSE +21 -0
- package/templates/.agent/skills/minimax-docx/SKILL.md +274 -0
- package/templates/.agent/skills/minimax-docx/assets/styles/academic_styles.xml +250 -0
- package/templates/.agent/skills/minimax-docx/assets/styles/corporate_styles.xml +284 -0
- package/templates/.agent/skills/minimax-docx/assets/styles/default_styles.xml +449 -0
- package/templates/.agent/skills/minimax-docx/assets/xsd/aesthetic-rules.xsd +470 -0
- package/templates/.agent/skills/minimax-docx/assets/xsd/business-rules.xsd +130 -0
- package/templates/.agent/skills/minimax-docx/assets/xsd/common-types.xsd +159 -0
- package/templates/.agent/skills/minimax-docx/assets/xsd/wml-subset.xsd +589 -0
- package/templates/.agent/skills/minimax-docx/references/cjk_typography.md +357 -0
- package/templates/.agent/skills/minimax-docx/references/cjk_university_template_guide.md +184 -0
- package/templates/.agent/skills/minimax-docx/references/comments_guide.md +191 -0
- package/templates/.agent/skills/minimax-docx/references/design_good_bad_examples.md +829 -0
- package/templates/.agent/skills/minimax-docx/references/design_principles.md +819 -0
- package/templates/.agent/skills/minimax-docx/references/openxml_element_order.md +308 -0
- package/templates/.agent/skills/minimax-docx/references/openxml_encyclopedia_part1.md +4061 -0
- package/templates/.agent/skills/minimax-docx/references/openxml_encyclopedia_part2.md +2820 -0
- package/templates/.agent/skills/minimax-docx/references/openxml_encyclopedia_part3.md +3381 -0
- package/templates/.agent/skills/minimax-docx/references/openxml_namespaces.md +82 -0
- package/templates/.agent/skills/minimax-docx/references/openxml_units.md +72 -0
- package/templates/.agent/skills/minimax-docx/references/scenario_a_create.md +284 -0
- package/templates/.agent/skills/minimax-docx/references/scenario_b_edit_content.md +295 -0
- package/templates/.agent/skills/minimax-docx/references/scenario_c_apply_template.md +456 -0
- package/templates/.agent/skills/minimax-docx/references/track_changes_guide.md +200 -0
- package/templates/.agent/skills/minimax-docx/references/troubleshooting.md +506 -0
- package/templates/.agent/skills/minimax-docx/references/typography_guide.md +294 -0
- package/templates/.agent/skills/minimax-docx/references/xsd_validation_guide.md +158 -0
- package/templates/.agent/skills/minimax-docx/scripts/doc_to_docx.sh +40 -0
- package/templates/.agent/skills/minimax-docx/scripts/docx_preview.sh +37 -0
- package/templates/.agent/skills/minimax-docx/scripts/dotnet/MiniMaxAIDocx.Cli/MiniMaxAIDocx.Cli.csproj +19 -0
- package/templates/.agent/skills/minimax-docx/scripts/dotnet/MiniMaxAIDocx.Cli/Program.cs +18 -0
- package/templates/.agent/skills/minimax-docx/scripts/dotnet/MiniMaxAIDocx.Core/Commands/AnalyzeCommand.cs +147 -0
- package/templates/.agent/skills/minimax-docx/scripts/dotnet/MiniMaxAIDocx.Core/Commands/ApplyTemplateCommand.cs +322 -0
- package/templates/.agent/skills/minimax-docx/scripts/dotnet/MiniMaxAIDocx.Core/Commands/CreateCommand.cs +324 -0
- package/templates/.agent/skills/minimax-docx/scripts/dotnet/MiniMaxAIDocx.Core/Commands/DiffCommand.cs +155 -0
- package/templates/.agent/skills/minimax-docx/scripts/dotnet/MiniMaxAIDocx.Core/Commands/EditContentCommand.cs +487 -0
- package/templates/.agent/skills/minimax-docx/scripts/dotnet/MiniMaxAIDocx.Core/Commands/FixOrderCommand.cs +108 -0
- package/templates/.agent/skills/minimax-docx/scripts/dotnet/MiniMaxAIDocx.Core/Commands/MergeRunsCommand.cs +122 -0
- package/templates/.agent/skills/minimax-docx/scripts/dotnet/MiniMaxAIDocx.Core/Commands/ValidateCommand.cs +107 -0
- package/templates/.agent/skills/minimax-docx/scripts/dotnet/MiniMaxAIDocx.Core/MiniMaxAIDocx.Core.csproj +15 -0
- package/templates/.agent/skills/minimax-docx/scripts/dotnet/MiniMaxAIDocx.Core/OpenXml/CommentSynchronizer.cs +169 -0
- package/templates/.agent/skills/minimax-docx/scripts/dotnet/MiniMaxAIDocx.Core/OpenXml/ElementOrder.cs +80 -0
- package/templates/.agent/skills/minimax-docx/scripts/dotnet/MiniMaxAIDocx.Core/OpenXml/NamespaceConstants.cs +42 -0
- package/templates/.agent/skills/minimax-docx/scripts/dotnet/MiniMaxAIDocx.Core/OpenXml/RunMerger.cs +81 -0
- package/templates/.agent/skills/minimax-docx/scripts/dotnet/MiniMaxAIDocx.Core/OpenXml/StyleAnalyzer.cs +81 -0
- package/templates/.agent/skills/minimax-docx/scripts/dotnet/MiniMaxAIDocx.Core/OpenXml/TrackChangesHelper.cs +99 -0
- package/templates/.agent/skills/minimax-docx/scripts/dotnet/MiniMaxAIDocx.Core/OpenXml/UnitConverter.cs +23 -0
- package/templates/.agent/skills/minimax-docx/scripts/dotnet/MiniMaxAIDocx.Core/Samples/AestheticRecipeSamples.cs +1832 -0
- package/templates/.agent/skills/minimax-docx/scripts/dotnet/MiniMaxAIDocx.Core/Samples/AestheticRecipeSamples_Batch1.cs +910 -0
- package/templates/.agent/skills/minimax-docx/scripts/dotnet/MiniMaxAIDocx.Core/Samples/AestheticRecipeSamples_Batch2.cs +999 -0
- package/templates/.agent/skills/minimax-docx/scripts/dotnet/MiniMaxAIDocx.Core/Samples/AestheticRecipeSamples_Batch3.cs +1048 -0
- package/templates/.agent/skills/minimax-docx/scripts/dotnet/MiniMaxAIDocx.Core/Samples/AestheticRecipeSamples_Batch4.cs +1038 -0
- package/templates/.agent/skills/minimax-docx/scripts/dotnet/MiniMaxAIDocx.Core/Samples/CharacterFormattingSamples.cs +1020 -0
- package/templates/.agent/skills/minimax-docx/scripts/dotnet/MiniMaxAIDocx.Core/Samples/DocumentCreationSamples.cs +1121 -0
- package/templates/.agent/skills/minimax-docx/scripts/dotnet/MiniMaxAIDocx.Core/Samples/FieldAndTocSamples.cs +624 -0
- package/templates/.agent/skills/minimax-docx/scripts/dotnet/MiniMaxAIDocx.Core/Samples/FootnoteAndCommentSamples.cs +675 -0
- package/templates/.agent/skills/minimax-docx/scripts/dotnet/MiniMaxAIDocx.Core/Samples/HeaderFooterSamples.cs +838 -0
- package/templates/.agent/skills/minimax-docx/scripts/dotnet/MiniMaxAIDocx.Core/Samples/ImageSamples.cs +917 -0
- package/templates/.agent/skills/minimax-docx/scripts/dotnet/MiniMaxAIDocx.Core/Samples/ListAndNumberingSamples.cs +826 -0
- package/templates/.agent/skills/minimax-docx/scripts/dotnet/MiniMaxAIDocx.Core/Samples/ParagraphFormattingSamples.cs +1199 -0
- package/templates/.agent/skills/minimax-docx/scripts/dotnet/MiniMaxAIDocx.Core/Samples/StyleSystemSamples.cs +1487 -0
- package/templates/.agent/skills/minimax-docx/scripts/dotnet/MiniMaxAIDocx.Core/Samples/TableSamples.cs +1163 -0
- package/templates/.agent/skills/minimax-docx/scripts/dotnet/MiniMaxAIDocx.Core/Samples/TrackChangesSamples.cs +595 -0
- package/templates/.agent/skills/minimax-docx/scripts/dotnet/MiniMaxAIDocx.Core/Typography/CjkHelper.cs +39 -0
- package/templates/.agent/skills/minimax-docx/scripts/dotnet/MiniMaxAIDocx.Core/Typography/FontDefaults.cs +24 -0
- package/templates/.agent/skills/minimax-docx/scripts/dotnet/MiniMaxAIDocx.Core/Typography/PageSizes.cs +20 -0
- package/templates/.agent/skills/minimax-docx/scripts/dotnet/MiniMaxAIDocx.Core/Validation/BusinessRuleValidator.cs +224 -0
- package/templates/.agent/skills/minimax-docx/scripts/dotnet/MiniMaxAIDocx.Core/Validation/GateCheckValidator.cs +148 -0
- package/templates/.agent/skills/minimax-docx/scripts/dotnet/MiniMaxAIDocx.Core/Validation/ValidationResult.cs +23 -0
- package/templates/.agent/skills/minimax-docx/scripts/dotnet/MiniMaxAIDocx.Core/Validation/XsdValidator.cs +69 -0
- package/templates/.agent/skills/minimax-docx/scripts/dotnet/MiniMaxAIDocx.slnx +4 -0
- package/templates/.agent/skills/minimax-docx/scripts/env_check.sh +196 -0
- package/templates/.agent/skills/minimax-docx/scripts/setup.ps1 +274 -0
- package/templates/.agent/skills/minimax-docx/scripts/setup.sh +504 -0
- package/templates/.agent/skills/minimax-multimodal-toolkit/SKILL.md +359 -0
- package/templates/.agent/skills/minimax-pdf/README.md +222 -0
- package/templates/.agent/skills/minimax-pdf/SKILL.md +201 -0
- package/templates/.agent/skills/minimax-pdf/design/design.md +381 -0
- package/templates/.agent/skills/minimax-pdf/scripts/cover.py +1579 -0
- package/templates/.agent/skills/minimax-pdf/scripts/fill_inspect.py +200 -0
- package/templates/.agent/skills/minimax-pdf/scripts/fill_write.py +242 -0
- package/templates/.agent/skills/minimax-pdf/scripts/make.sh +491 -0
- package/templates/.agent/skills/minimax-pdf/scripts/merge.py +112 -0
- package/templates/.agent/skills/minimax-pdf/scripts/palette.py +559 -0
- package/templates/.agent/skills/minimax-pdf/scripts/reformat_parse.py +374 -0
- package/templates/.agent/skills/minimax-pdf/scripts/render_body.py +1055 -0
- package/templates/.agent/skills/minimax-pdf/scripts/render_cover.cjs +111 -0
- package/templates/.agent/skills/minimax-xlsx/SKILL.md +138 -0
- package/templates/.agent/skills/minimax-xlsx/references/create.md +691 -0
- package/templates/.agent/skills/minimax-xlsx/references/edit.md +684 -0
- package/templates/.agent/skills/minimax-xlsx/references/fix.md +37 -0
- package/templates/.agent/skills/minimax-xlsx/references/format.md +768 -0
- package/templates/.agent/skills/minimax-xlsx/references/ooxml-cheatsheet.md +231 -0
- package/templates/.agent/skills/minimax-xlsx/references/read-analyze.md +97 -0
- package/templates/.agent/skills/minimax-xlsx/references/validate.md +772 -0
- package/templates/.agent/skills/minimax-xlsx/scripts/formula_check.py +422 -0
- package/templates/.agent/skills/minimax-xlsx/scripts/libreoffice_recalc.py +248 -0
- package/templates/.agent/skills/minimax-xlsx/scripts/shared_strings_builder.py +163 -0
- package/templates/.agent/skills/minimax-xlsx/scripts/style_audit.py +575 -0
- package/templates/.agent/skills/minimax-xlsx/scripts/xlsx_add_column.py +395 -0
- package/templates/.agent/skills/minimax-xlsx/scripts/xlsx_insert_row.py +274 -0
- package/templates/.agent/skills/minimax-xlsx/scripts/xlsx_pack.py +87 -0
- package/templates/.agent/skills/minimax-xlsx/scripts/xlsx_reader.py +362 -0
- package/templates/.agent/skills/minimax-xlsx/scripts/xlsx_shift_rows.py +396 -0
- package/templates/.agent/skills/minimax-xlsx/scripts/xlsx_unpack.py +130 -0
- package/templates/.agent/skills/minimax-xlsx/templates/minimal_xlsx/[Content_Types].xml +9 -0
- package/templates/.agent/skills/minimax-xlsx/templates/minimal_xlsx/_rels/.rels +6 -0
- package/templates/.agent/skills/minimax-xlsx/templates/minimal_xlsx/xl/_rels/workbook.xml.rels +19 -0
- package/templates/.agent/skills/minimax-xlsx/templates/minimal_xlsx/xl/sharedStrings.xml +33 -0
- package/templates/.agent/skills/minimax-xlsx/templates/minimal_xlsx/xl/styles.xml +160 -0
- package/templates/.agent/skills/minimax-xlsx/templates/minimal_xlsx/xl/workbook.xml +30 -0
- package/templates/.agent/skills/minimax-xlsx/templates/minimal_xlsx/xl/worksheets/sheet1.xml +70 -0
- package/templates/.agent/skills/pptx-generator/SKILL.md +249 -0
- package/templates/.agent/skills/pptx-generator/references/design-system.md +392 -0
- package/templates/.agent/skills/pptx-generator/references/editing.md +162 -0
- package/templates/.agent/skills/pptx-generator/references/pitfalls.md +112 -0
- package/templates/.agent/skills/pptx-generator/references/pptxgenjs.md +420 -0
- package/templates/.agent/skills/pptx-generator/references/slide-types.md +413 -0
- package/templates/.agent/skills/tutorial-video-expert/SKILL.md +88 -0
- package/templates/.agent/skills/ui-ux-pro-max/SKILL.md +170 -585
- package/templates/.agent/skills/vision-analysis/SKILL.md +174 -0
- package/templates/.agent/workflows/analyze.md +3 -0
- package/templates/.agent/workflows/brand-report.md +44 -0
- package/templates/.agent/workflows/report.md +49 -0
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
# SPDX-License-Identifier: MIT
|
|
3
|
+
"""
|
|
4
|
+
xlsx_pack.py — Pack a working directory back into a valid xlsx file.
|
|
5
|
+
|
|
6
|
+
Usage:
|
|
7
|
+
python3 xlsx_pack.py <source_dir> <output.xlsx>
|
|
8
|
+
|
|
9
|
+
Requirements:
|
|
10
|
+
- source_dir must contain [Content_Types].xml at its root
|
|
11
|
+
- All XML files are re-validated for well-formedness before packing
|
|
12
|
+
|
|
13
|
+
The resulting xlsx is a valid ZIP archive with correct OOXML structure.
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
import sys
|
|
17
|
+
import os
|
|
18
|
+
import zipfile
|
|
19
|
+
import xml.etree.ElementTree as ET
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def validate_xml_files(source_dir: str) -> list[str]:
    """Walk *source_dir* and return "relpath: error" strings for every
    .xml/.rels file that is not well-formed XML.

    An empty list means every XML part parsed cleanly.
    """
    failures: list[str] = []
    for root, _dirs, names in os.walk(source_dir):
        for name in names:
            # Only OOXML part files are XML; skip everything else.
            if not name.endswith((".xml", ".rels")):
                continue
            full = os.path.join(root, name)
            try:
                ET.parse(full)
            except ET.ParseError as e:
                rel = os.path.relpath(full, source_dir)
                failures.append(f"{rel}: {e}")
    return failures
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def pack(source_dir: str, xlsx_path: str) -> None:
    """Validate *source_dir* and zip its contents into *xlsx_path*.

    Exits with status 1 (after printing to stderr) when the directory is
    missing, lacks [Content_Types].xml, or contains malformed XML.
    """
    if not os.path.isdir(source_dir):
        print(f"ERROR: Directory not found: {source_dir}", file=sys.stderr)
        sys.exit(1)

    # [Content_Types].xml is mandatory at the package root (OPC spec);
    # spreadsheet apps refuse to open an archive without it.
    content_types = os.path.join(source_dir, "[Content_Types].xml")
    if not os.path.isfile(content_types):
        print(
            f"ERROR: Missing [Content_Types].xml in {source_dir}\n"
            "       This file is required at the root of every valid xlsx package.",
            file=sys.stderr,
        )
        sys.exit(1)

    # Validate XML well-formedness before packing: consumers reject the
    # whole archive on the first parse error, so fail fast here.
    print("Validating XML files...")
    bad_files = validate_xml_files(source_dir)
    if bad_files:
        print("ERROR: The following files have XML parse errors:", file=sys.stderr)
        for b in bad_files:
            print(f"  {b}", file=sys.stderr)
        print(
            "\nFix all XML errors before packing. "
            "A malformed xlsx cannot be opened by Excel or LibreOffice.",
            file=sys.stderr,
        )
        sys.exit(1)

    print("✓ All XML files are well-formed")

    # Count files to pack (reported in the summary line below).
    file_count = sum(len(files) for _, _, files in os.walk(source_dir))

    with zipfile.ZipFile(xlsx_path, "w", compression=zipfile.ZIP_DEFLATED) as z:
        for dirpath, _, filenames in os.walk(source_dir):
            for fname in filenames:
                fpath = os.path.join(dirpath, fname)
                arcname = os.path.relpath(fpath, source_dir)
                # BUGFIX: ZIP/OPC entry names must use forward slashes.
                # os.path.relpath yields backslashes on Windows, producing
                # entries like "xl\\workbook.xml" that Excel cannot resolve.
                z.write(fpath, arcname.replace(os.sep, "/"))

    size = os.path.getsize(xlsx_path)
    print(f"Packed {file_count} files → '{xlsx_path}' ({size:,} bytes)")
    print("\nNext step: run formula_check.py to validate formulas:")
    print(f"  python3 formula_check.py {xlsx_path}")
|
81
|
+
|
|
82
|
+
|
|
83
|
+
if __name__ == "__main__":
    # CLI entry point: xlsx_pack.py <source_dir> <output.xlsx>
    if len(sys.argv) != 3:
        # Usage errors go to stderr, consistent with pack()'s error output.
        print("Usage: xlsx_pack.py <source_dir> <output.xlsx>", file=sys.stderr)
        sys.exit(1)
    pack(sys.argv[1], sys.argv[2])
|
|
@@ -0,0 +1,362 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
# SPDX-License-Identifier: MIT
|
|
3
|
+
"""
|
|
4
|
+
xlsx_reader.py — Structure discovery and data analysis tool for Excel/CSV files.
|
|
5
|
+
|
|
6
|
+
Usage:
|
|
7
|
+
python3 xlsx_reader.py <file> # full structure report
|
|
8
|
+
python3 xlsx_reader.py <file> --sheet Sales # analyze one sheet
|
|
9
|
+
python3 xlsx_reader.py <file> --json # machine-readable output
|
|
10
|
+
python3 xlsx_reader.py <file> --quality # data quality audit only
|
|
11
|
+
|
|
12
|
+
Supports: .xlsx, .xlsm, .csv, .tsv
|
|
13
|
+
Does NOT modify the source file in any way.
|
|
14
|
+
|
|
15
|
+
Exit codes:
|
|
16
|
+
0 — success
|
|
17
|
+
1 — file not found / unsupported format / encoding failure
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
import sys
|
|
21
|
+
import json
|
|
22
|
+
import argparse
|
|
23
|
+
from pathlib import Path
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
# ---------------------------------------------------------------------------
|
|
27
|
+
# Format detection and loading
|
|
28
|
+
# ---------------------------------------------------------------------------
|
|
29
|
+
|
|
30
|
+
def detect_and_load(file_path: str, sheet_name_filter: str | None = None) -> dict:
|
|
31
|
+
"""
|
|
32
|
+
Load file into {sheet_name: DataFrame} dict.
|
|
33
|
+
CSV/TSV files are mapped to a single-key dict using the file stem as key.
|
|
34
|
+
|
|
35
|
+
Raises ValueError for unsupported formats or encoding failures.
|
|
36
|
+
"""
|
|
37
|
+
try:
|
|
38
|
+
import pandas as pd
|
|
39
|
+
except ImportError:
|
|
40
|
+
raise RuntimeError(
|
|
41
|
+
"pandas is not installed. Run: pip install pandas openpyxl"
|
|
42
|
+
)
|
|
43
|
+
|
|
44
|
+
path = Path(file_path)
|
|
45
|
+
if not path.exists():
|
|
46
|
+
raise FileNotFoundError(f"File not found: {file_path}")
|
|
47
|
+
|
|
48
|
+
suffix = path.suffix.lower()
|
|
49
|
+
|
|
50
|
+
if suffix in (".xlsx", ".xlsm"):
|
|
51
|
+
target = sheet_name_filter if sheet_name_filter else None
|
|
52
|
+
result = pd.read_excel(file_path, sheet_name=target)
|
|
53
|
+
# pd.read_excel with sheet_name=None returns dict; with a name, returns DataFrame
|
|
54
|
+
if isinstance(result, dict):
|
|
55
|
+
return result
|
|
56
|
+
else:
|
|
57
|
+
return {sheet_name_filter: result}
|
|
58
|
+
|
|
59
|
+
elif suffix in (".csv", ".tsv"):
|
|
60
|
+
sep = "\t" if suffix == ".tsv" else ","
|
|
61
|
+
encodings = ["utf-8-sig", "gbk", "utf-8", "latin-1"]
|
|
62
|
+
last_error = None
|
|
63
|
+
for enc in encodings:
|
|
64
|
+
try:
|
|
65
|
+
import pandas as pd
|
|
66
|
+
df = pd.read_csv(file_path, sep=sep, encoding=enc)
|
|
67
|
+
df._reader_encoding = enc # attach metadata (non-standard, for reporting)
|
|
68
|
+
return {path.stem: df}
|
|
69
|
+
except (UnicodeDecodeError, Exception) as e:
|
|
70
|
+
last_error = e
|
|
71
|
+
continue
|
|
72
|
+
raise ValueError(
|
|
73
|
+
f"Cannot decode {file_path}. Tried encodings: {encodings}. "
|
|
74
|
+
f"Last error: {last_error}"
|
|
75
|
+
)
|
|
76
|
+
|
|
77
|
+
elif suffix == ".xls":
|
|
78
|
+
raise ValueError(
|
|
79
|
+
".xls is a legacy binary format not supported by this tool. "
|
|
80
|
+
"Please open the file in Excel and save as .xlsx, then retry."
|
|
81
|
+
)
|
|
82
|
+
|
|
83
|
+
else:
|
|
84
|
+
raise ValueError(
|
|
85
|
+
f"Unsupported file format: {suffix}. "
|
|
86
|
+
"Supported formats: .xlsx, .xlsm, .csv, .tsv"
|
|
87
|
+
)
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
# ---------------------------------------------------------------------------
|
|
91
|
+
# Structure discovery
|
|
92
|
+
# ---------------------------------------------------------------------------
|
|
93
|
+
|
|
94
|
+
def explore_structure(sheets: dict) -> dict:
    """Describe every sheet in *sheets* ({name: DataFrame}).

    Each entry maps sheet_name -> {shape, columns, dtypes, null_columns,
    preview}, where null_columns lists only columns containing nulls
    (count plus percentage) and preview holds the first five rows as records.
    """

    def _describe(df) -> dict:
        # max(len, 1) guards the percentage against an empty DataFrame.
        denom = max(len(df), 1)
        nulls = {}
        for col, cnt in df.isnull().sum().items():
            if cnt > 0:
                nulls[col] = {"count": int(cnt), "pct": round(cnt / denom * 100, 1)}
        rows, cols = df.shape
        return {
            "shape": {"rows": rows, "cols": cols},
            "columns": list(df.columns),
            "dtypes": {c: str(t) for c, t in df.dtypes.items()},
            "null_columns": nulls,
            "preview": df.head(5).to_dict(orient="records"),
        }

    return {name: _describe(frame) for name, frame in sheets.items()}
|
115
|
+
|
|
116
|
+
|
|
117
|
+
# ---------------------------------------------------------------------------
|
|
118
|
+
# Data quality audit
|
|
119
|
+
# ---------------------------------------------------------------------------
|
|
120
|
+
|
|
121
|
+
def audit_quality(sheets: dict) -> dict:
    """
    Return data quality findings per sheet.
    Checks: nulls, duplicates, mixed-type columns, potential year formatting issues.

    Parameters:
        sheets: mapping of sheet_name -> pandas DataFrame.

    Returns:
        Mapping of sheet_name -> list of finding dicts. Every finding carries
        at least "type" and "note" keys; a clean sheet maps to an empty list.
    """
    import pandas as pd

    findings = {}
    for sheet_name, df in sheets.items():
        sheet_findings = []

        # Null values: one finding per column containing any nulls.
        null_counts = df.isnull().sum()
        for col, cnt in null_counts.items():
            if cnt > 0:
                # max(len(df), 1) guards the percentage against empty sheets.
                pct = round(cnt / max(len(df), 1) * 100, 1)
                sheet_findings.append({
                    "type": "null_values",
                    "column": col,
                    "count": int(cnt),
                    "pct": pct,
                    "note": f"Column '{col}' has {cnt} null values ({pct}%). "
                            "If this column contains Excel formulas, null values may "
                            "indicate that the formula cache has not been populated "
                            "(file was never opened in Excel after the formulas were written)."
                })

        # Duplicate rows (identical across ALL columns).
        dup_count = int(df.duplicated().sum())
        if dup_count > 0:
            sheet_findings.append({
                "type": "duplicate_rows",
                "count": dup_count,
                "note": f"{dup_count} fully duplicate rows found."
            })

        # Mixed-type object columns (numeric data stored as text):
        # flagged when only SOME non-null values parse as numbers.
        for col in df.select_dtypes(include="object").columns:
            numeric_converted = pd.to_numeric(df[col], errors="coerce")
            convertible = int(numeric_converted.notna().sum())
            non_null_total = int(df[col].notna().sum())
            if 0 < convertible < non_null_total:
                sheet_findings.append({
                    "type": "mixed_type",
                    "column": col,
                    "convertible_to_numeric": convertible,
                    "non_convertible": non_null_total - convertible,
                    "note": f"Column '{col}' appears to contain mixed types: "
                            f"{convertible} values can be parsed as numbers, "
                            f"{non_null_total - convertible} cannot. "
                            "Use pd.to_numeric(df[col], errors='coerce') to unify."
                })

        # Year column formatting (e.g., 2024.0 stored as float).
        for col in df.select_dtypes(include="number").columns:
            col_lower = str(col).lower()
            # "年" is the Chinese character for "year" — detect year columns in CJK spreadsheets
            if "year" in col_lower or "yr" in col_lower or "年" in col_lower:
                # Only flag when every non-null value is a plausible year.
                if df[col].dropna().between(1900, 2200).all():
                    if df[col].dtype == float:
                        sheet_findings.append({
                            "type": "year_as_float",
                            "column": col,
                            "note": f"Column '{col}' appears to be a year column stored as float "
                                    "(e.g., 2024.0). Convert with df[col].astype(int).astype(str) "
                                    "to get clean year strings like '2024'."
                        })

        # Outliers via IQR on numeric columns (Tukey's 1.5×IQR fences).
        for col in df.select_dtypes(include="number").columns:
            series = df[col].dropna()
            if len(series) < 4:
                # Quartiles are meaningless on fewer than 4 observations.
                continue
            Q1, Q3 = series.quantile(0.25), series.quantile(0.75)
            IQR = Q3 - Q1
            if IQR == 0:
                # Constant-ish column: fences collapse to a point; skip.
                continue
            outlier_mask = (df[col] < Q1 - 1.5 * IQR) | (df[col] > Q3 + 1.5 * IQR)
            outlier_count = int(outlier_mask.sum())
            if outlier_count > 0:
                sheet_findings.append({
                    "type": "outliers_iqr",
                    "column": col,
                    "count": outlier_count,
                    "note": f"Column '{col}' has {outlier_count} potential outlier(s) "
                            f"(outside 1.5×IQR bounds: [{Q1 - 1.5*IQR:.2f}, {Q3 + 1.5*IQR:.2f}])."
                })

        findings[sheet_name] = sheet_findings

    return findings
|
212
|
+
|
|
213
|
+
|
|
214
|
+
# ---------------------------------------------------------------------------
|
|
215
|
+
# Summary statistics
|
|
216
|
+
# ---------------------------------------------------------------------------
|
|
217
|
+
|
|
218
|
+
def compute_stats(sheets: dict) -> dict:
    """Per-sheet describe() summary (rounded to 4 decimals) of numeric columns.

    Sheets with no numeric columns map to an empty dict.
    """
    summary: dict = {}
    for name, frame in sheets.items():
        numeric = frame.select_dtypes(include="number")
        summary[name] = {} if numeric.empty else numeric.describe().round(4).to_dict()
    return summary
|
229
|
+
|
|
230
|
+
|
|
231
|
+
# ---------------------------------------------------------------------------
|
|
232
|
+
# Human-readable report rendering
|
|
233
|
+
# ---------------------------------------------------------------------------
|
|
234
|
+
|
|
235
|
+
def render_report(
    file_path: str,
    structure: dict,
    quality: dict,
    stats: dict,
) -> str:
    """Render a human-readable plain-text analysis report.

    Parameters:
        file_path: original file path (only its basename is displayed).
        structure: output of explore_structure().
        quality: output of audit_quality().
        stats: output of compute_stats() (may be empty when --quality is set).

    Returns:
        The complete report as one newline-joined string.
    """
    lines = []
    p = lines.append  # shorthand: p("...") appends one report line

    p("=" * 60)
    p(f"ANALYSIS REPORT: {Path(file_path).name}")
    p("=" * 60)

    # File overview
    sheet_list = list(structure.keys())
    total_rows = sum(s["shape"]["rows"] for s in structure.values())
    p(f"\nSheets ({len(sheet_list)}): {', '.join(sheet_list)}")
    p(f"Total rows across all sheets: {total_rows:,}")

    for sheet_name, info in structure.items():
        p(f"\n{'─' * 50}")
        p(f"Sheet: {sheet_name}")
        p(f"{'─' * 50}")
        p(f"  Size: {info['shape']['rows']:,} rows × {info['shape']['cols']} cols")
        p(f"  Columns: {info['columns']}")

        # Data types
        p("\n  Column types:")
        for col, dtype in info["dtypes"].items():
            p(f"    {col}: {dtype}")

        # Nulls
        if info["null_columns"]:
            p("\n  Null values (columns with nulls only):")
            for col, null_info in info["null_columns"].items():
                p(f"    {col}: {null_info['count']} nulls ({null_info['pct']}%)")
        else:
            p("\n  Null values: none")

        # Stats (absent when the caller skipped compute_stats)
        sheet_stats = stats.get(sheet_name, {})
        if sheet_stats:
            p("\n  Numeric column statistics:")
            numeric_cols = list(sheet_stats.keys())
            # Show only first 6 to keep report readable
            for col in numeric_cols[:6]:
                col_stats = sheet_stats[col]
                p(f"    {col}:")
                p(f"      count={col_stats.get('count', 'N/A')} "
                  f"mean={col_stats.get('mean', 'N/A')} "
                  f"min={col_stats.get('min', 'N/A')} "
                  f"max={col_stats.get('max', 'N/A')}")
            if len(numeric_cols) > 6:
                p(f"    ... and {len(numeric_cols) - 6} more numeric columns")

        # Quality findings for this sheet
        sheet_quality = quality.get(sheet_name, [])
        if sheet_quality:
            p(f"\n  Data quality issues ({len(sheet_quality)} found):")
            for finding in sheet_quality:
                p(f"    [{finding['type'].upper()}] {finding['note']}")
        else:
            p("\n  Data quality: no issues found")

        # Preview (first 3 of the up-to-5 stored preview rows)
        if info["preview"]:
            p("\n  Preview (first 3 rows):")
            import pandas as pd
            preview_df = pd.DataFrame(info["preview"][:3])
            for line in preview_df.to_string(index=False).splitlines():
                p(f"    {line}")

    # Overall verdict footer
    p("\n" + "=" * 60)
    quality_issue_count = sum(len(v) for v in quality.values())
    if quality_issue_count == 0:
        p("RESULT: No data quality issues detected.")
    else:
        p(f"RESULT: {quality_issue_count} data quality issue(s) found. See details above.")
    p("=" * 60)

    return "\n".join(lines)
|
316
|
+
|
|
317
|
+
|
|
318
|
+
# ---------------------------------------------------------------------------
|
|
319
|
+
# CLI entry point
|
|
320
|
+
# ---------------------------------------------------------------------------
|
|
321
|
+
|
|
322
|
+
def main() -> None:
    """CLI entry point: parse arguments, load the file, print report or JSON."""
    cli = argparse.ArgumentParser(
        description="Read and analyze Excel/CSV files without modifying them."
    )
    cli.add_argument("file", help="Path to .xlsx, .xlsm, .csv, or .tsv file")
    cli.add_argument("--sheet", help="Analyze a specific sheet only", default=None)
    cli.add_argument(
        "--json", action="store_true", help="Output machine-readable JSON"
    )
    cli.add_argument(
        "--quality", action="store_true",
        help="Run data quality audit only (skip stats)"
    )
    opts = cli.parse_args()

    try:
        sheets = detect_and_load(opts.file, sheet_name_filter=opts.sheet)
    except (FileNotFoundError, ValueError, RuntimeError) as err:
        print(f"ERROR: {err}", file=sys.stderr)
        sys.exit(1)

    structure = explore_structure(sheets)
    quality = audit_quality(sheets)
    stats = compute_stats(sheets) if not opts.quality else {}

    if not opts.json:
        print(render_report(opts.file, structure, quality, stats))
        return

    payload = {
        "file": opts.file,
        "structure": structure,
        "quality": quality,
        "stats": stats,
    }
    # default=str covers non-JSON-native preview values (e.g. timestamps).
    print(json.dumps(payload, indent=2, ensure_ascii=False, default=str))
|
359
|
+
|
|
360
|
+
|
|
361
|
+
if __name__ == "__main__":
    # Invoke the CLI only when run as a script, not on import.
    main()
|