@panda-agent/panda-cli 0.1.29 → 0.1.31
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/pandacli.mjs +6 -1
- package/bundled-preset-skills/.gitkeep +0 -0
- package/bundled-preset-skills/README.md +17 -0
- package/bundled-preset-skills/docx/.skill-metadata.yaml +173 -0
- package/bundled-preset-skills/docx/LICENSE.txt +30 -0
- package/bundled-preset-skills/docx/SKILL.md +589 -0
- package/bundled-preset-skills/docx/scripts/__init__.py +1 -0
- package/bundled-preset-skills/docx/scripts/accept_changes.py +206 -0
- package/bundled-preset-skills/docx/scripts/comment.py +442 -0
- package/bundled-preset-skills/docx/scripts/office/helpers/__init__.py +1 -0
- package/bundled-preset-skills/docx/scripts/office/helpers/merge_runs.py +190 -0
- package/bundled-preset-skills/docx/scripts/office/helpers/simplify_redlines.py +185 -0
- package/bundled-preset-skills/docx/scripts/office/pack.py +167 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chart.xsd +1499 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chartDrawing.xsd +146 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd +1085 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-lockedCanvas.xsd +11 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-main.xsd +3081 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-picture.xsd +23 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-spreadsheetDrawing.xsd +185 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd +287 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/pml.xsd +1676 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-additionalCharacteristics.xsd +28 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-bibliography.xsd +144 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd +174 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlDataProperties.xsd +25 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlSchemaProperties.xsd +18 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd +59 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd +56 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesVariantTypes.xsd +195 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-math.xsd +582 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-relationshipReference.xsd +25 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/sml.xsd +4439 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-main.xsd +570 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-officeDrawing.xsd +509 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-presentationDrawing.xsd +12 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-spreadsheetDrawing.xsd +108 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-wordprocessingDrawing.xsd +96 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/wml.xsd +3646 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/xml.xsd +116 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/ecma/fouth-edition/opc-contentTypes.xsd +42 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/ecma/fouth-edition/opc-coreProperties.xsd +50 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/ecma/fouth-edition/opc-digSig.xsd +49 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/ecma/fouth-edition/opc-relationships.xsd +33 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/mce/mc.xsd +75 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/microsoft/wml-2010.xsd +560 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/microsoft/wml-2012.xsd +67 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/microsoft/wml-2018.xsd +14 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/microsoft/wml-cex-2018.xsd +20 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/microsoft/wml-cid-2016.xsd +13 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/microsoft/wml-sdtdatahash-2020.xsd +4 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/microsoft/wml-symex-2015.xsd +8 -0
- package/bundled-preset-skills/docx/scripts/office/soffice.py +194 -0
- package/bundled-preset-skills/docx/scripts/office/unpack.py +145 -0
- package/bundled-preset-skills/docx/scripts/office/validate.py +114 -0
- package/bundled-preset-skills/docx/scripts/office/validators/__init__.py +16 -0
- package/bundled-preset-skills/docx/scripts/office/validators/base.py +733 -0
- package/bundled-preset-skills/docx/scripts/office/validators/docx.py +354 -0
- package/bundled-preset-skills/docx/scripts/office/validators/pptx.py +230 -0
- package/bundled-preset-skills/docx/scripts/office/validators/redlining.py +212 -0
- package/bundled-preset-skills/docx/scripts/templates/comments.xml +3 -0
- package/bundled-preset-skills/docx/scripts/templates/commentsExtended.xml +3 -0
- package/bundled-preset-skills/docx/scripts/templates/commentsExtensible.xml +3 -0
- package/bundled-preset-skills/docx/scripts/templates/commentsIds.xml +3 -0
- package/bundled-preset-skills/docx/scripts/templates/people.xml +3 -0
- package/bundled-preset-skills/frontend-design/LICENSE.txt +177 -0
- package/bundled-preset-skills/frontend-design/SKILL.md +42 -0
- package/bundled-preset-skills/pdf/.skill-metadata.yaml +273 -0
- package/bundled-preset-skills/pdf/LICENSE.txt +30 -0
- package/bundled-preset-skills/pdf/SKILL.md +324 -0
- package/bundled-preset-skills/pdf/advanced-reference.md +609 -0
- package/bundled-preset-skills/pdf/form-filling-guide.md +318 -0
- package/bundled-preset-skills/pdf/forms.md +294 -0
- package/bundled-preset-skills/pdf/reference.md +612 -0
- package/bundled-preset-skills/pdf/scripts/check_bounding_boxes.py +198 -0
- package/bundled-preset-skills/pdf/scripts/check_fillable_fields.py +64 -0
- package/bundled-preset-skills/pdf/scripts/convert_pdf_to_images.py +102 -0
- package/bundled-preset-skills/pdf/scripts/create_validation_image.py +125 -0
- package/bundled-preset-skills/pdf/scripts/extract_form_field_info.py +220 -0
- package/bundled-preset-skills/pdf/scripts/extract_form_structure.py +202 -0
- package/bundled-preset-skills/pdf/scripts/fill_fillable_fields.py +205 -0
- package/bundled-preset-skills/pdf/scripts/fill_pdf_form_with_annotations.py +193 -0
- package/bundled-preset-skills/pptx-generator/SKILL.md +204 -0
- package/bundled-preset-skills/pptx-generator/assets/styles/business.json +8 -0
- package/bundled-preset-skills/pptx-generator/assets/styles/minimal.json +8 -0
- package/bundled-preset-skills/pptx-generator/assets/styles/modern.json +8 -0
- package/bundled-preset-skills/pptx-generator/assets/templates/ppt_data_template.json +40 -0
- package/bundled-preset-skills/pptx-generator/references/collaboration_guide.md +381 -0
- package/bundled-preset-skills/pptx-generator/references/json_format_spec.md +215 -0
- package/bundled-preset-skills/pptx-generator/references/layout_guide.md +290 -0
- package/bundled-preset-skills/pptx-generator/scripts/json_validator.py +194 -0
- package/bundled-preset-skills/pptx-generator/scripts/pptx_builder.py +340 -0
- package/bundled-preset-skills/pptx-generator/scripts/pptx_validator.py +162 -0
- package/bundled-preset-skills/skill-creator/LICENSE.txt +202 -0
- package/bundled-preset-skills/skill-creator/SKILL.md +479 -0
- package/bundled-preset-skills/skill-creator/agents/analyzer.md +274 -0
- package/bundled-preset-skills/skill-creator/agents/comparator.md +202 -0
- package/bundled-preset-skills/skill-creator/agents/grader.md +223 -0
- package/bundled-preset-skills/skill-creator/assets/eval_review.html +146 -0
- package/bundled-preset-skills/skill-creator/eval-viewer/generate_review.py +471 -0
- package/bundled-preset-skills/skill-creator/eval-viewer/viewer.html +1325 -0
- package/bundled-preset-skills/skill-creator/references/schemas.md +430 -0
- package/bundled-preset-skills/skill-creator/scripts/__init__.py +0 -0
- package/bundled-preset-skills/skill-creator/scripts/aggregate_benchmark.py +401 -0
- package/bundled-preset-skills/skill-creator/scripts/generate_report.py +326 -0
- package/bundled-preset-skills/skill-creator/scripts/improve_description.py +248 -0
- package/bundled-preset-skills/skill-creator/scripts/package_skill.py +136 -0
- package/bundled-preset-skills/skill-creator/scripts/quick_validate.py +103 -0
- package/bundled-preset-skills/skill-creator/scripts/run_eval.py +310 -0
- package/bundled-preset-skills/skill-creator/scripts/run_loop.py +332 -0
- package/bundled-preset-skills/skill-creator/scripts/utils.py +47 -0
- package/bundled-preset-skills/xlsx/.skill-metadata.yaml +185 -0
- package/bundled-preset-skills/xlsx/LICENSE.txt +30 -0
- package/bundled-preset-skills/xlsx/SKILL.md +233 -0
- package/bundled-preset-skills/xlsx/scripts/office/helpers/__init__.py +1 -0
- package/bundled-preset-skills/xlsx/scripts/office/helpers/merge_runs.py +226 -0
- package/bundled-preset-skills/xlsx/scripts/office/helpers/simplify_redlines.py +198 -0
- package/bundled-preset-skills/xlsx/scripts/office/pack.py +162 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chart.xsd +1499 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chartDrawing.xsd +146 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd +1085 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-lockedCanvas.xsd +11 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-main.xsd +3081 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-picture.xsd +23 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-spreadsheetDrawing.xsd +185 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd +287 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/pml.xsd +1676 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-additionalCharacteristics.xsd +28 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-bibliography.xsd +144 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd +174 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlDataProperties.xsd +25 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlSchemaProperties.xsd +18 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd +59 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd +56 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesVariantTypes.xsd +195 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-math.xsd +582 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-relationshipReference.xsd +25 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/sml.xsd +4439 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-main.xsd +570 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-officeDrawing.xsd +509 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-presentationDrawing.xsd +12 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-spreadsheetDrawing.xsd +108 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-wordprocessingDrawing.xsd +96 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/wml.xsd +3646 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/xml.xsd +116 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/ecma/fouth-edition/opc-contentTypes.xsd +42 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/ecma/fouth-edition/opc-coreProperties.xsd +50 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/ecma/fouth-edition/opc-digSig.xsd +49 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/ecma/fouth-edition/opc-relationships.xsd +33 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/mce/mc.xsd +75 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/microsoft/wml-2010.xsd +560 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/microsoft/wml-2012.xsd +67 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/microsoft/wml-2018.xsd +14 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/microsoft/wml-cex-2018.xsd +20 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/microsoft/wml-cid-2016.xsd +13 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/microsoft/wml-sdtdatahash-2020.xsd +4 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/microsoft/wml-symex-2015.xsd +8 -0
- package/bundled-preset-skills/xlsx/scripts/office/soffice.py +185 -0
- package/bundled-preset-skills/xlsx/scripts/office/unpack.py +146 -0
- package/bundled-preset-skills/xlsx/scripts/office/validate.py +108 -0
- package/bundled-preset-skills/xlsx/scripts/office/validators/__init__.py +13 -0
- package/bundled-preset-skills/xlsx/scripts/office/validators/base.py +800 -0
- package/bundled-preset-skills/xlsx/scripts/office/validators/docx.py +383 -0
- package/bundled-preset-skills/xlsx/scripts/office/validators/pptx.py +250 -0
- package/bundled-preset-skills/xlsx/scripts/office/validators/redlining.py +229 -0
- package/bundled-preset-skills/xlsx/scripts/recalc.py +296 -0
- package/dist/panda-cli-ink.bundle.mjs +276 -342
- package/package.json +6 -4
|
@@ -0,0 +1,233 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: xlsx
|
|
3
|
+
version: 1.0.1
|
|
4
|
+
description: "Use this skill any time a spreadsheet file is the primary input or output. This means any task where the user wants to: open, read, edit, or fix an existing .xlsx, .xlsm, .csv, or .tsv file (e.g., adding columns, computing formulas, formatting, charting, cleaning messy data); create a new spreadsheet from scratch or from other data sources; or convert between tabular file formats. Trigger especially when the user references a spreadsheet file by name or path — even casually (like \"the xlsx in my downloads\") — and wants something done to it or produced from it. Also trigger for cleaning or restructuring messy tabular data files (malformed rows, misplaced headers, junk data) into proper spreadsheets. The deliverable must be a spreadsheet file. Do NOT trigger when the primary deliverable is a Word document, HTML report, standalone Python script, database pipeline, or Google Sheets API integration, even if tabular data is involved."
|
|
5
|
+
description_zh: "当电子表格文件是主要输入或输出时使用此技能。包括:打开、读取、编辑或修复现有的 .xlsx、.xlsm、.csv 或 .tsv 文件(如添加列、计算公式、格式化、图表、清洗数据);从零或其他数据源创建新电子表格;在表格文件格式之间转换。当用户提及电子表格文件名或路径时触发——即使是随意提及(如\"下载文件夹里的 xlsx\")——并希望对其进行操作或生成电子表格。也适用于将混乱的表格数据文件(格式错误的行、错位的表头、垃圾数据)清理重组为规范的电子表格。交付物必须是电子表格文件。当主要交付物是 Word 文档、HTML 报告、独立 Python 脚本、数据库管道或 Google Sheets API 集成时,即使涉及表格数据也不要触发。"
|
|
6
|
+
license: Proprietary. LICENSE.txt has complete terms
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
# Spreadsheet Creation, Editing, and Analysis
|
|
10
|
+
|
|
11
|
+
You have access to multiple tools and workflows for working with `.xlsx` files — from reading and analysing data, through programmatic creation and editing, to formula recalculation and error checking.
|
|
12
|
+
|
|
13
|
+
## Tooling Primer
|
|
14
|
+
|
|
15
|
+
| Library | Best for |
|
|
16
|
+
|---------|----------|
|
|
17
|
+
| **pandas** | Bulk data manipulation, statistical analysis, quick CSV↔XLSX conversion |
|
|
18
|
+
| **openpyxl** | Cell-level formatting, Excel formulas, charts, conditional formatting |
|
|
19
|
+
|
|
20
|
+
## ⛔ CRITICAL — Use Formulas, Never Hardcode Calculations
|
|
21
|
+
|
|
22
|
+
> **🚨 MANDATORY RULE — ZERO EXCEPTIONS 🚨**
|
|
23
|
+
>
|
|
24
|
+
> **Every computed value must be an Excel formula, not a Python-calculated literal.**
|
|
25
|
+
> This keeps the workbook dynamic and self-updating.
|
|
26
|
+
> Violations will produce stale, non-updating spreadsheets.
|
|
27
|
+
|
|
28
|
+
```python
|
|
29
|
+
# ── WRONG — baking Python results into cells ──
|
|
30
|
+
total = df['Sales'].sum()
|
|
31
|
+
ws['B10'] = total # static 5000
|
|
32
|
+
|
|
33
|
+
growth = (df.iloc[-1]['Revenue'] - df.iloc[0]['Revenue']) / df.iloc[0]['Revenue']
|
|
34
|
+
ws['C5'] = growth # static 0.15
|
|
35
|
+
|
|
36
|
+
avg = sum(vals) / len(vals)
|
|
37
|
+
ws['D20'] = avg # static 42.5
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
```python
|
|
41
|
+
# ── RIGHT — let Excel do the maths ──
|
|
42
|
+
ws['B10'] = '=SUM(B2:B9)'
|
|
43
|
+
ws['C5'] = '=(C4-C2)/C2'
|
|
44
|
+
ws['D20'] = '=AVERAGE(D2:D19)'
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
This rule applies to **all** calculations — sums, ratios, percentages, differences, etc.
|
|
48
|
+
|
|
49
|
+
## Step-by-Step Workflow
|
|
50
|
+
|
|
51
|
+
1. **Pick a library** — pandas for data; openpyxl for formatting / formulas.
|
|
52
|
+
2. **Open or create** the workbook.
|
|
53
|
+
3. **Modify** — add/edit data, formulas, and styles.
|
|
54
|
+
4. **Save** to disk.
|
|
55
|
+
5. **Recalculate** (mandatory when formulas are present):
|
|
56
|
+
```bash
|
|
57
|
+
python scripts/recalc.py output.xlsx
|
|
58
|
+
```
|
|
59
|
+
6. **Inspect the JSON output** and fix any errors:
|
|
60
|
+
- `status: "errors_found"` → see `error_summary` for types and locations.
|
|
61
|
+
- Common errors: `#REF!` (bad reference), `#DIV/0!` (zero denominator), `#VALUE!` (type mismatch), `#NAME?` (unknown function).
|
|
62
|
+
|
|
63
|
+
## Reading & Analysing Data
|
|
64
|
+
|
|
65
|
+
```python
|
|
66
|
+
import pandas as pd
|
|
67
|
+
|
|
68
|
+
df = pd.read_excel('file.xlsx') # first sheet
|
|
69
|
+
sheets = pd.read_excel('file.xlsx', sheet_name=None) # all sheets → dict
|
|
70
|
+
|
|
71
|
+
df.head(); df.info(); df.describe() # quick overview
|
|
72
|
+
|
|
73
|
+
df.to_excel('result.xlsx', index=False) # write back
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
## Building New Workbooks
|
|
77
|
+
|
|
78
|
+
```python
|
|
79
|
+
from openpyxl import Workbook
|
|
80
|
+
from openpyxl.styles import Font, PatternFill, Alignment
|
|
81
|
+
|
|
82
|
+
wb = Workbook()
|
|
83
|
+
ws = wb.active
|
|
84
|
+
|
|
85
|
+
ws['A1'] = 'Header'
|
|
86
|
+
ws.append(['Row', 'of', 'data'])
|
|
87
|
+
ws['B2'] = '=SUM(A1:A10)'
|
|
88
|
+
|
|
89
|
+
ws['A1'].font = Font(bold=True, color='FF0000')
|
|
90
|
+
ws['A1'].fill = PatternFill('solid', start_color='FFFF00')
|
|
91
|
+
ws['A1'].alignment = Alignment(horizontal='center')
|
|
92
|
+
ws.column_dimensions['A'].width = 20
|
|
93
|
+
|
|
94
|
+
wb.save('output.xlsx')
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
## Modifying Existing Files
|
|
98
|
+
|
|
99
|
+
```python
|
|
100
|
+
from openpyxl import load_workbook
|
|
101
|
+
|
|
102
|
+
wb = load_workbook('existing.xlsx')
|
|
103
|
+
ws = wb.active # or wb['SheetName']
|
|
104
|
+
|
|
105
|
+
for name in wb.sheetnames:
|
|
106
|
+
print("Sheet: {}".format(name))
|
|
107
|
+
|
|
108
|
+
ws['A1'] = 'Updated'
|
|
109
|
+
ws.insert_rows(2)
|
|
110
|
+
ws.delete_cols(3)
|
|
111
|
+
|
|
112
|
+
extra = wb.create_sheet('Extra')
|
|
113
|
+
extra['A1'] = 'New data'
|
|
114
|
+
|
|
115
|
+
wb.save('modified.xlsx')
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
## Formula Recalculation
|
|
119
|
+
|
|
120
|
+
Workbooks produced by openpyxl contain formula *strings* but no cached results. Use the bundled helper to populate those values:
|
|
121
|
+
|
|
122
|
+
```bash
|
|
123
|
+
python scripts/recalc.py <excel_file> [timeout_seconds]
|
|
124
|
+
```
|
|
125
|
+
|
|
126
|
+
**What it does:**
|
|
127
|
+
- Deploys a LibreOffice Basic macro (first run only)
|
|
128
|
+
- Invokes LibreOffice headless to recalculate every formula
|
|
129
|
+
- Scans all cells for Excel error markers
|
|
130
|
+
- Emits structured JSON
|
|
131
|
+
|
|
132
|
+
**Prerequisite:** LibreOffice must be installed. The helper handles first-run configuration automatically, including sandboxed environments where Unix sockets are restricted (via `scripts/office/soffice.py`).
|
|
133
|
+
|
|
134
|
+
### Interpreting the Output
|
|
135
|
+
|
|
136
|
+
```json
|
|
137
|
+
{
|
|
138
|
+
"status": "success",
|
|
139
|
+
"total_errors": 0,
|
|
140
|
+
"total_formulas": 42,
|
|
141
|
+
"error_summary": {}
|
|
142
|
+
}
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
When `status` is `"errors_found"`, `error_summary` lists each error type with count and cell locations (up to 20 per type).
|
|
146
|
+
|
|
147
|
+
---
|
|
148
|
+
|
|
149
|
+
# Output Quality Standards
|
|
150
|
+
|
|
151
|
+
## General — All Workbooks
|
|
152
|
+
|
|
153
|
+
| Area | Requirement |
|
|
154
|
+
|------|-------------|
|
|
155
|
+
| **Typography** | Use a single professional typeface (Arial, Times New Roman, …) throughout, unless the user specifies otherwise |
|
|
156
|
+
| **Error-free delivery** | Zero formula errors — no `#REF!`, `#DIV/0!`, `#VALUE!`, `#N/A`, `#NAME?` |
|
|
157
|
+
| **Template fidelity** | When editing an existing file, study and replicate its formatting conventions exactly; never impose a different style on an already-patterned workbook |
|
|
158
|
+
|
|
159
|
+
## Financial Models
|
|
160
|
+
|
|
161
|
+
### Colour Conventions (override only when the user or template says otherwise)
|
|
162
|
+
|
|
163
|
+
| Element | RGB | Hex | Meaning |
|
|
164
|
+
|---------|-----|-----|---------|
|
|
165
|
+
| Blue text | `(0,0,255)` | `#0000FF` | Hard-coded inputs / scenario toggles |
|
|
166
|
+
| Black text | `(0,0,0)` | `#000000` | All formulas and calculated values |
|
|
167
|
+
| Green text | `(0,128,0)` | `#008000` | Cross-sheet references within the same workbook |
|
|
168
|
+
| Red text | `(255,0,0)` | `#FF0000` | External links to other files |
|
|
169
|
+
| Yellow fill | `(255,255,0)` | `#FFFF00` | Key assumptions or cells requiring review |
|
|
170
|
+
|
|
171
|
+
### Number Formatting
|
|
172
|
+
|
|
173
|
+
| Data Type | Excel Format Code | Display Example | Notes |
|
|
174
|
+
|-----------|-------------------|-----------------|-------|
|
|
175
|
+
| Calendar years | _(plain text)_ | `2024` | Never `2,024` — no thousands separator |
|
|
176
|
+
| Currency | `$#,##0` | `$1,250` | Always label units in headers, e.g. _Revenue ($mm)_ |
|
|
177
|
+
| Zero values | `$#,##0;($#,##0);-` | `-` | Custom three-section format |
|
|
178
|
+
| Percentages | `0.0%` | `12.5%` | One decimal place |
|
|
179
|
+
| Multiples | `0.0x` | `3.2x` | For EV/EBITDA, P/E, etc. |
|
|
180
|
+
| Negatives | `#,##0;(#,##0)` | `(123)` | Parenthesised, never `-123` |
|
|
181
|
+
|
|
182
|
+
### Formula Best Practices
|
|
183
|
+
|
|
184
|
+
- **Centralise assumptions** — growth rates, margins, multiples belong in labelled assumption cells; formulas should reference those cells, not embed literals.
|
|
185
|
+
```
|
|
186
|
+
=B5*(1+$B$6) ✓
|
|
187
|
+
=B5*1.05 ✗
|
|
188
|
+
```
|
|
189
|
+
- **Prevent errors** — verify references, check range boundaries, confirm consistent formulas across projection periods, test edge cases.
|
|
190
|
+
- **No circular references** — unless explicitly designed and documented.
|
|
191
|
+
- **Document hard-codes** — add a cell comment or adjacent note:
|
|
192
|
+
`Source: Company 10-K, FY2024, Page 45, Revenue Note, [SEC EDGAR URL]`
|
|
193
|
+
|
|
194
|
+
## Formula Verification Checklist
|
|
195
|
+
|
|
196
|
+
- Spot-check 2–3 references before building the full model.
|
|
197
|
+
- Confirm column mapping (column 64 → BL, not BK).
|
|
198
|
+
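- Remember row offsets — Excel rows are 1-based while DataFrame positions are 0-based (DataFrame row 5 = Excel row 6, or row 7 when row 1 holds the header).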
|
|
199
|
+
- Guard against `NaN` — use `pd.notna()`.
|
|
200
|
+
- Test far-right columns (FY data often sits in column 50+).
|
|
201
|
+
- Handle multiple matches — search all occurrences, not just the first.
|
|
202
|
+
|
|
203
|
+
### Testing Strategy
|
|
204
|
+
|
|
205
|
+
1. Validate formulas on a small range first.
|
|
206
|
+
2. Verify every referenced cell exists.
|
|
207
|
+
3. Include zero, negative, and very large values.
|
|
208
|
+
|
|
209
|
+
## Code Style
|
|
210
|
+
|
|
211
|
+
When generating Python that manipulates spreadsheets:
|
|
212
|
+
- Keep code concise — no verbose names, no gratuitous comments.
|
|
213
|
+
- Skip unnecessary `print()` calls.
|
|
214
|
+
|
|
215
|
+
For the workbook itself:
|
|
216
|
+
- Comment cells that contain complex formulas or key assumptions.
|
|
217
|
+
- Cite data sources for every hard-coded figure.
|
|
218
|
+
- Add section headers and notes for major model blocks.
|
|
219
|
+
|
|
220
|
+
## Library Tips
|
|
221
|
+
|
|
222
|
+
### openpyxl
|
|
223
|
+
|
|
224
|
+
- Indices are **1-based** — `(row=1, column=1)` is cell A1.
|
|
225
|
+
- `data_only=True` reads cached values; **warning:** saving afterward strips all formulas permanently.
|
|
226
|
+
- For large files: `read_only=True` (reading) or `write_only=True` (writing).
|
|
227
|
+
- Formulas are stored as strings and require `scripts/recalc.py` to populate values.
|
|
228
|
+
|
|
229
|
+
### pandas
|
|
230
|
+
|
|
231
|
+
- Specify dtypes to avoid inference surprises: `pd.read_excel('f.xlsx', dtype={'id': str})`.
|
|
232
|
+
- Limit columns on large files: `usecols=['A', 'C', 'E']`.
|
|
233
|
+
- Parse dates explicitly: `parse_dates=['date_column']`.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# helpers sub-package (intentionally empty)
|
|
@@ -0,0 +1,226 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
# ──────────────────────────────────────────────────────────────────
|
|
3
|
+
# Coalesce adjacent <w:r> elements sharing identical <w:rPr> in DOCX.
|
|
4
|
+
#
|
|
5
|
+
# Pre-processing steps that enable merging:
|
|
6
|
+
# 1. Strip all proofErr elements (spell/grammar markers)
|
|
7
|
+
# 2. Remove rsid* attributes from runs (revision metadata)
|
|
8
|
+
#
|
|
9
|
+
# After merging, adjacent <w:t> children inside the same run are
|
|
10
|
+
# concatenated into a single text node.
|
|
11
|
+
# ──────────────────────────────────────────────────────────────────
|
|
12
|
+
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
|
|
15
|
+
import defusedxml.minidom
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def merge_runs(input_dir: str) -> tuple[int, str]:
    """Merge runs in ``word/document.xml`` below *input_dir*.

    Returns a ``(count, message)`` pair. When the document file is missing,
    or when any step raises, the count is 0 and the message starts with
    ``"Error: "``. On success the file is rewritten in place and the count
    is the sum of the values returned by ``_coalesce_in_container``.
    """
    xml_path = Path(input_dir) / "word" / "document.xml"
    if not xml_path.exists():
        return 0, "Error: {} not found".format(xml_path)

    try:
        dom = defusedxml.minidom.parseString(xml_path.read_text(encoding="utf-8"))
        root = dom.documentElement

        # Cleanup passes run before the containers are collected.
        _purge_by_tag(root, "proofErr")
        _drop_rsid_attrs(root)

        # A set, so each container holding runs is processed exactly once.
        containers = {run.parentNode for run in _query_tag(root, "r")}
        total = sum(_coalesce_in_container(box) for box in containers)

        xml_path.write_bytes(dom.toxml(encoding="UTF-8"))
    except Exception as err:
        # Best-effort boundary: failures are reported through the message.
        return 0, "Error: {}".format(err)

    return total, "Merged {} runs".format(total)
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
# ──────────────────────────────────────────────────────────────────
|
|
48
|
+
# DOM traversal helpers
|
|
49
|
+
# ──────────────────────────────────────────────────────────────────
|
|
50
|
+
|
|
51
|
+
def _query_tag(root, local_name: str) -> list:
    """Collect every element at or below *root* whose name matches.

    An element matches when its ``localName`` (or ``tagName`` when
    ``localName`` is empty) equals *local_name* or ends with
    ``":" + local_name``. Non-element nodes are skipped together with
    anything beneath them. Results are in document (pre-order) order.
    """
    suffix = ":{}".format(local_name)
    found = []
    pending = [root]  # explicit stack in place of a recursive walk
    while pending:
        node = pending.pop()
        if node.nodeType != node.ELEMENT_NODE:
            continue
        name = node.localName or node.tagName
        if name == local_name or name.endswith(suffix):
            found.append(node)
        # Push children reversed so they pop in their original order.
        pending.extend(reversed(list(node.childNodes)))
    return found
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def _child_by_tag(parent, local_name: str):
    """Return the first direct child element of *parent* named *local_name*.

    Only immediate children are inspected. A child matches when its
    ``localName`` (or ``tagName`` when ``localName`` is empty) equals
    *local_name* or ends with ``":" + local_name``. Returns ``None`` when
    no child matches.
    """
    suffix = ":{}".format(local_name)
    candidates = (
        kid for kid in parent.childNodes
        if kid.nodeType == kid.ELEMENT_NODE
    )
    for kid in candidates:
        name = kid.localName or kid.tagName
        if name == local_name or name.endswith(suffix):
            return kid
    return None
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def _children_by_tag(parent, local_name: str) -> list:
    """Return all direct child elements of *parent* named *local_name*.

    Matching uses ``localName`` (or ``tagName`` when ``localName`` is
    empty): the name must equal *local_name* or end with
    ``":" + local_name``. Children are returned in their original order;
    the list is empty when nothing matches.
    """
    suffix = ":{}".format(local_name)
    selected = []
    for kid in parent.childNodes:
        if kid.nodeType != kid.ELEMENT_NODE:
            continue
        name = kid.localName or kid.tagName
        if name == local_name or name.endswith(suffix):
            selected.append(kid)
    return selected
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def _only_whitespace_between(a, b) -> bool:
    """Report whether *b* follows *a* with nothing meaningful in between.

    Walks the siblings after *a*. Returns ``True`` on reaching *b*, and
    ``False`` if an element or a text node with non-whitespace content is
    met first, or if the sibling chain ends without reaching *b*. Other
    node kinds (for example comments) are stepped over.
    """
    node = a.nextSibling
    while node is not None and node is not b:
        if node.nodeType == node.ELEMENT_NODE:
            return False
        if node.nodeType == node.TEXT_NODE and node.data.strip():
            return False
        node = node.nextSibling
    # The loop stops either at *b* or at the end of the sibling chain.
    return node is not None
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
# ──────────────────────────────────────────────────────────────────
|
|
104
|
+
# Cleanup passes
|
|
105
|
+
# ──────────────────────────────────────────────────────────────────
|
|
106
|
+
|
|
107
|
+
def _purge_by_tag(root, local_name: str):
    """Detach every element found by ``_query_tag(root, local_name)``.

    Each match that still has a parent is removed from that parent;
    matches without a parent are left alone.
    """
    for element in _query_tag(root, local_name):
        owner = element.parentNode
        if owner is None:
            continue
        owner.removeChild(element)
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def _drop_rsid_attrs(root):
    """Remove rsid-style attributes from elements matched as ``"r"``.

    For every element returned by ``_query_tag(root, "r")``, any attribute
    whose name contains ``"rsid"`` (case-insensitively) is removed.
    """
    for run in _query_tag(root, "r"):
        # Snapshot the names first so removal does not disturb iteration.
        rsid_names = [
            attr.name
            for attr in run.attributes.values()
            if "rsid" in attr.name.lower()
        ]
        for attr_name in rsid_names:
            run.removeAttribute(attr_name)
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
# ──────────────────────────────────────────────────────────────────
|
|
123
|
+
# Core merging logic
|
|
124
|
+
# ──────────────────────────────────────────────────────────────────
|
|
125
|
+
|
|
126
|
+
def _tag_is_run(nd) -> bool:
    """Return True when *nd* is named ``r``, with or without a prefix.

    Uses ``localName``, falling back to ``tagName`` when ``localName`` is
    empty. The name matches if it is exactly ``"r"`` or ends in ``":r"``.
    """
    name = nd.localName or nd.tagName
    # The segment after the last colon is "r" exactly in those two cases.
    return name.rpartition(":")[2] == "r"
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def _next_elem(nd):
|
|
132
|
+
"""Return the next element sibling (skip text/comment nodes)."""
|
|
133
|
+
s = nd.nextSibling
|
|
134
|
+
while s is not None:
|
|
135
|
+
if s.nodeType == s.ELEMENT_NODE:
|
|
136
|
+
return s
|
|
137
|
+
s = s.nextSibling
|
|
138
|
+
return None
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def _next_run_sibling(nd):
    """Return the first following sibling that is a run element, else ``None``.

    Unlike ``_next_elem`` this also steps over elements that are not runs.
    """
    candidate = nd.nextSibling
    while candidate is not None:
        is_element = candidate.nodeType == candidate.ELEMENT_NODE
        if is_element and _tag_is_run(candidate):
            break
        candidate = candidate.nextSibling
    return candidate
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
def _first_run_child(container):
    """Return the first direct child of *container* that is a run element.

    Yields ``None`` when *container* has no such child.
    """
    return next(
        (
            kid
            for kid in container.childNodes
            if kid.nodeType == kid.ELEMENT_NODE and _tag_is_run(kid)
        ),
        None,
    )
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
def _runs_compatible(a, b) -> bool:
    """Decide whether runs *a* and *b* carry identical ``rPr`` formatting.

    Two runs that both lack an ``rPr`` child are compatible; if only one has
    it they are not.  Otherwise the two ``rPr`` elements must serialise to
    exactly the same XML string.
    """
    props_a = _child_by_tag(a, "rPr")
    props_b = _child_by_tag(b, "rPr")
    if props_a is None and props_b is None:
        return True
    if props_a is None or props_b is None:
        return False
    return props_a.toxml() == props_b.toxml()
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
def _absorb_run(dst, src):
|
|
169
|
+
"""Move non-rPr children from *src* into *dst*."""
|
|
170
|
+
for ch in list(src.childNodes):
|
|
171
|
+
if ch.nodeType != ch.ELEMENT_NODE:
|
|
172
|
+
continue
|
|
173
|
+
tag = ch.localName or ch.tagName
|
|
174
|
+
if tag == "rPr" or tag.endswith(":rPr"):
|
|
175
|
+
continue
|
|
176
|
+
dst.appendChild(ch)
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
def _squash_text_nodes(run):
    """Fold neighbouring ``t`` children of *run* into a single ``t`` each.

    Two ``t`` elements are folded when ``_only_whitespace_between`` accepts
    them.  The right-hand text is appended to the left-hand element, the
    right-hand element is removed, and ``xml:space="preserve"`` is set on the
    survivor when the merged text starts or ends with a space (and removed
    otherwise).
    """
    texts = _children_by_tag(run, "t")

    # Walk right-to-left so removing the right-hand node never disturbs a
    # pair that is still to be examined.
    for idx in range(len(texts) - 1, 0, -1):
        left, right = texts[idx - 1], texts[idx]
        if not _only_whitespace_between(left, right):
            continue

        left_text = left.firstChild
        right_text = right.firstChild
        merged = (left_text.data if left_text else "") + (
            right_text.data if right_text else ""
        )

        if left_text:
            left_text.data = merged
        else:
            # The left element was empty, so it needs a text node to hold
            # the merged content.
            left.appendChild(run.ownerDocument.createTextNode(merged))

        edge_space = merged.startswith(" ") or merged.endswith(" ")
        if edge_space:
            left.setAttribute("xml:space", "preserve")
        elif left.hasAttribute("xml:space"):
            left.removeAttribute("xml:space")

        run.removeChild(right)
|
|
205
|
+
|
|
206
|
+
|
|
207
|
+
def _coalesce_in_container(container) -> int:
    """Fold each streak of compatible neighbouring runs into its first run.

    Starting from the first run child, the next element sibling is absorbed
    for as long as it is a run with matching formatting.  The surviving run's
    text nodes are then squashed, and the scan resumes at the next run
    sibling.

    Returns the number of runs that were absorbed and removed.
    """
    merges = 0
    anchor = _first_run_child(container)

    while anchor is not None:
        follower = _next_elem(anchor)
        while (
            follower is not None
            and _tag_is_run(follower)
            and _runs_compatible(anchor, follower)
        ):
            _absorb_run(anchor, follower)
            container.removeChild(follower)
            merges += 1
            # The anchor stays put; look at whatever now follows it.
            follower = _next_elem(anchor)

        _squash_text_nodes(anchor)
        anchor = _next_run_sibling(anchor)

    return merges
|
|
@@ -0,0 +1,198 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
# ──────────────────────────────────────────────────────────────────
|
|
3
|
+
# Merge adjacent tracked-change wrappers (<w:ins> / <w:del>) when they
|
|
4
|
+
# share the same author. Reduces visual clutter in heavily-redlined
|
|
5
|
+
# DOCX documents without altering semantics.
|
|
6
|
+
#
|
|
7
|
+
# Constraints:
|
|
8
|
+
# • Only merges elements of the *same* tag (ins↔ins, del↔del)
|
|
9
|
+
# • Author must match (timestamps are ignored)
|
|
10
|
+
# • Only merges truly adjacent elements (whitespace-only gap allowed)
|
|
11
|
+
# ──────────────────────────────────────────────────────────────────
|
|
12
|
+
|
|
13
|
+
import xml.etree.ElementTree as ET
|
|
14
|
+
import zipfile
|
|
15
|
+
from pathlib import Path
|
|
16
|
+
|
|
17
|
+
import defusedxml.minidom
|
|
18
|
+
|
|
19
|
+
_WML_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def simplify_redlines(input_dir: str) -> tuple[int, str]:
    """Merge adjacent tracked changes in ``<input_dir>/word/document.xml``.

    Every ``p`` and ``tc`` element in the document is treated as a container
    whose neighbouring ``ins`` elements, and then ``del`` elements, are
    coalesced.  The file is rewritten in place as UTF-8.

    Returns ``(merged_count, message)``.  A missing file or any exception
    raised while processing is reported as ``(0, "Error: ...")`` rather than
    propagated.
    """
    target = Path(input_dir) / "word" / "document.xml"
    if not target.exists():
        return 0, "Error: {} not found".format(target)

    try:
        dom = defusedxml.minidom.parseString(target.read_text(encoding="utf-8"))
        root = dom.documentElement

        # Collect all containers up front, before any merging mutates the tree.
        containers = _collect_elements(root, "p") + _collect_elements(root, "tc")

        merged = 0
        for holder in containers:
            for kind in ("ins", "del"):
                merged += _coalesce_tracked(holder, kind)

        target.write_bytes(dom.toxml(encoding="UTF-8"))
        return merged, "Simplified {} tracked changes".format(merged)

    except Exception as err:
        return 0, "Error: {}".format(err)
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
# ──────────────────────────────────────────────────────────────────
|
|
46
|
+
# Internal helpers
|
|
47
|
+
# ──────────────────────────────────────────────────────────────────
|
|
48
|
+
|
|
49
|
+
def _coalesce_tracked(container, kind: str) -> int:
    """Fold neighbouring *kind* wrappers (``ins`` or ``del``) in *container*.

    Only direct children of *container* are considered.  A wrapper is folded
    into the previous surviving wrapper when ``_same_author_adjacent``
    accepts the pair; its children are moved across and the empty wrapper is
    removed.

    Returns the number of wrappers removed.
    """
    wrappers = [
        kid
        for kid in container.childNodes
        if kid.nodeType == kid.ELEMENT_NODE and _matches_tag(kid, kind)
    ]
    if len(wrappers) < 2:
        return 0

    merged = 0
    anchor = wrappers[0]
    for candidate in wrappers[1:]:
        if _same_author_adjacent(anchor, candidate):
            _move_children(anchor, candidate)
            container.removeChild(candidate)
            merged += 1
        else:
            # No merge: the candidate becomes the wrapper later ones fold into.
            anchor = candidate

    return merged
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def _matches_tag(nd, local: str) -> bool:
|
|
74
|
+
tag = nd.localName or nd.tagName
|
|
75
|
+
return tag == local or tag.endswith(":{}".format(local))
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def _author_of(elem) -> str:
|
|
79
|
+
"""Extract the w:author attribute value from a tracked-change element."""
|
|
80
|
+
val = elem.getAttribute("w:author")
|
|
81
|
+
if val:
|
|
82
|
+
return val
|
|
83
|
+
for attr in elem.attributes.values():
|
|
84
|
+
if attr.localName == "author" or attr.name.endswith(":author"):
|
|
85
|
+
return attr.value
|
|
86
|
+
return ""
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def _same_author_adjacent(a, b) -> bool:
    """True when *a* and *b* share an author and *b* directly follows *a*.

    "Directly follows" means *b* is reached by walking ``nextSibling`` from
    *a* while passing only whitespace-only text (or other non-element,
    non-text nodes).  An intervening element or non-blank text node yields
    ``False``.

    If the sibling chain ends before *b* is reached, *b* is not a following
    sibling of *a* at all, so the pair is reported as not adjacent.
    """
    if _author_of(a) != _author_of(b):
        return False

    cur = a.nextSibling
    while cur is not None:
        if cur is b:
            return True
        if cur.nodeType == cur.ELEMENT_NODE:
            return False
        if cur.nodeType == cur.TEXT_NODE and cur.data.strip():
            return False
        cur = cur.nextSibling

    # Ran off the end of the siblings without meeting *b*: merging here would
    # move content across unrelated positions, so refuse.
    return False
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def _move_children(dst, src):
|
|
104
|
+
"""Transplant every child of *src* into *dst*."""
|
|
105
|
+
while src.firstChild is not None:
|
|
106
|
+
kid = src.firstChild
|
|
107
|
+
src.removeChild(kid)
|
|
108
|
+
dst.appendChild(kid)
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def _collect_elements(root, local: str) -> list:
|
|
112
|
+
"""Depth-first collection of elements matching *local*."""
|
|
113
|
+
out = []
|
|
114
|
+
|
|
115
|
+
def _dfs(nd):
|
|
116
|
+
if nd.nodeType == nd.ELEMENT_NODE:
|
|
117
|
+
tag = nd.localName or nd.tagName
|
|
118
|
+
if tag == local or tag.endswith(":{}".format(local)):
|
|
119
|
+
out.append(nd)
|
|
120
|
+
for ch in nd.childNodes:
|
|
121
|
+
_dfs(ch)
|
|
122
|
+
|
|
123
|
+
_dfs(root)
|
|
124
|
+
return out
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
# ──────────────────────────────────────────────────────────────────
|
|
128
|
+
# Author-analysis utilities (used by pack.py / infer_author)
|
|
129
|
+
# ──────────────────────────────────────────────────────────────────
|
|
130
|
+
|
|
131
|
+
def get_tracked_change_authors(xml_path: Path) -> dict[str, int]:
    """Tally ``w:ins`` / ``w:del`` elements in *xml_path* by author.

    Returns a mapping of author name to number of tracked-change elements
    carrying that author.  Elements with no (or an empty) author attribute
    are ignored.  A missing file or XML that fails to parse yields ``{}``.
    """
    if not xml_path.exists():
        return {}
    try:
        root = ET.parse(xml_path).getroot()
    except ET.ParseError:
        return {}

    # ElementTree spells namespaced attribute names as "{uri}local".
    author_attr = "{{{}}}author".format(_WML_NS)
    prefixes = {"w": _WML_NS}

    tally: dict[str, int] = {}
    for kind in ("ins", "del"):
        for change in root.findall(".//w:{}".format(kind), prefixes):
            author = change.get(author_attr)
            if not author:
                continue
            tally[author] = tally.get(author, 0) + 1
    return tally
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
def _get_authors_from_docx(docx: Path) -> dict[str, int]:
|
|
153
|
+
"""Extract author counts from a packed .docx without full unpacking."""
|
|
154
|
+
try:
|
|
155
|
+
with zipfile.ZipFile(docx, "r") as zf:
|
|
156
|
+
if "word/document.xml" not in zf.namelist():
|
|
157
|
+
return {}
|
|
158
|
+
with zf.open("word/document.xml") as fh:
|
|
159
|
+
parsed = ET.parse(fh)
|
|
160
|
+
ns = {"w": _WML_NS}
|
|
161
|
+
attr_key = "{{{}}}author".format(_WML_NS)
|
|
162
|
+
counts: dict[str, int] = {}
|
|
163
|
+
for tag in ("ins", "del"):
|
|
164
|
+
for el in parsed.getroot().findall(".//w:{}".format(tag), ns):
|
|
165
|
+
who = el.get(attr_key)
|
|
166
|
+
if who:
|
|
167
|
+
counts[who] = counts.get(who, 0) + 1
|
|
168
|
+
return counts
|
|
169
|
+
except (zipfile.BadZipFile, ET.ParseError):
|
|
170
|
+
return {}
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
def infer_author(modified_dir: Path, original_docx: Path, default: str = "Claude") -> str:
    """Work out which author's tracked-change count grew after editing.

    Per-author counts from ``<modified_dir>/word/document.xml`` are compared
    with those from *original_docx*.  The single author whose count increased
    is returned.  *default* is returned when the modified document has no
    tracked changes or no author's count increased.

    Raises:
        ValueError: if more than one author's count increased.
    """
    after = get_tracked_change_authors(modified_dir / "word" / "document.xml")
    if not after:
        return default

    before = _get_authors_from_docx(original_docx)

    # Keep only authors with a net gain, in the order they were first seen.
    gained = {
        author: count - before.get(author, 0)
        for author, count in after.items()
        if count - before.get(author, 0) > 0
    }

    if not gained:
        return default
    if len(gained) > 1:
        raise ValueError(
            "Multiple authors added new changes: {}. "
            "Cannot infer which author to validate.".format(gained)
        )

    (sole_author,) = gained
    return sole_author
|