adeu 0.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
adeu-0.4.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 dealfluence
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
adeu-0.4.0/PKG-INFO ADDED
@@ -0,0 +1,109 @@
1
+ Metadata-Version: 2.4
2
+ Name: adeu
3
+ Version: 0.4.0
4
+ Summary: Automated DOCX Redlining Engine
5
+ License-File: LICENSE
6
+ Author: Mikko Korpela
7
+ Requires-Python: >=3.12
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: Programming Language :: Python :: 3.12
10
+ Classifier: Programming Language :: Python :: 3.13
11
+ Classifier: Programming Language :: Python :: 3.14
12
+ Requires-Dist: diff-match-patch (>=20230430)
13
+ Requires-Dist: lxml (>=5.0.0)
14
+ Requires-Dist: mcp (>=1.2.0)
15
+ Requires-Dist: pydantic (>=2.0.0)
16
+ Requires-Dist: python-docx (>=1.1.0)
17
+ Requires-Dist: structlog (>=24.0.0)
18
+ Description-Content-Type: text/markdown
19
+
20
+ # Adeu: AI Redlining Engine
21
+
22
+ **Adeu allows AI Agents and LLMs to "Track Changes" in Microsoft Word documents.**
23
+
24
+ Most LLMs output raw text or Markdown. Professionals need `w:ins` (insertions) and `w:del` (deletions) to review changes inside Word. The `adeu` library presents a Word document in a textual format that both LLMs and humans can read, and applies the changes made to that text back to the actual Word document.
25
+
26
+ It creates a "Virtual DOM" of your document, letting AI apply surgical edits without breaking your formatting, numbering, or headers.
27
+
28
+ ---
29
+
30
+ ## Installation
31
+
32
+ Adeu is available on PyPI.
33
+
34
+ ```bash
35
+ pip install adeu
36
+ ```
37
+
38
+ ---
39
+
40
+ ## Ways to Use Adeu
41
+
42
+ ### 1. As MCP Server (No Code Required)
43
+ If you use an agentic system such as Claude Desktop, you can connect Adeu directly. This lets you handle contracts in Claude and say: *"Change the Governing Law to Delaware and generate me the redline."*
44
+
45
+ Add this to your `claude_desktop_config.json`:
46
+
47
+ ```json
48
+ {
49
+ "mcpServers": {
50
+ "adeu": {
51
+ "command": "uvx",
52
+ "args": ["--from", "adeu", "adeu-server"]
53
+ }
54
+ }
55
+ }
56
+ ```
57
+ *(Requires [uv](https://docs.astral.sh/uv/) installed on your machine)*
58
+
59
+ ### 2. For "Vibe Coding" & Python Scripts
60
+ Building your own Agentic AI tool in Cursor, Replit, or Windsurf: Adeu is the engine that handles the document manipulation for you.
61
+
62
+ ```python
63
+ from adeu import RedlineEngine, DocumentEdit
64
+ from io import BytesIO
65
+
66
+ # 1. Load your contract
67
+ with open("NDA.docx", "rb") as f:
68
+ doc_stream = BytesIO(f.read())
69
+
70
+ # 2. Define the change (Usually this comes from your LLM response)
71
+ edit = DocumentEdit(
72
+ target_text="State of New York",
73
+ new_text="State of Delaware",
74
+ comment="Changed governing law to neutral jurisdiction."
75
+ )
76
+
77
+ # 3. Apply the Redline
78
+ engine = RedlineEngine(doc_stream, author="AI Associate")
79
+ engine.apply_edits([edit])
80
+
81
+ # 4. Save
82
+ with open("NDA_Redlined.docx", "wb") as f:
83
+ f.write(engine.save_to_stream().getvalue())
84
+ ```
85
+
86
+ ### 3. The CLI
87
+ Quickly extract text or apply patches from your terminal.
88
+
89
+ ```bash
90
+ # Compare two docs and see a summary
91
+ adeu diff v1.docx v2.docx
92
+
93
+ # Apply a JSON list of edits to a doc
94
+ adeu apply agreement.docx edits.json
95
+ ```
96
+
97
+ ---
98
+
99
+ ## Why Adeu?
100
+
101
+ * **Native Redlines**: Generates real Microsoft Word Track Changes. You can "Accept" or "Reject" them in Word.
102
+ * **Format Safe**: Adeu preserves your complex numbering, headers, footers, and images. It only touches the text you change.
103
+ * **Native Comments**: Supports adding comments (`Review Pane`) linked to specific text ranges.
104
+ * **Intelligent Mapping**: Handles the messy internal XML of Word documents (e.g., when "Contract" is split into `["Con", "tract"]` by spellcheck).
105
+
106
+ ## License
107
+
108
+ MIT License. Open source and free to use in commercial legal tech applications.
109
+
adeu-0.4.0/README.md ADDED
@@ -0,0 +1,89 @@
1
+ # Adeu: AI Redlining Engine
2
+
3
+ **Adeu allows AI Agents and LLMs to "Track Changes" in Microsoft Word documents.**
4
+
5
+ Most LLMs output raw text or Markdown. Professionals need `w:ins` (insertions) and `w:del` (deletions) to review changes inside Word. The `adeu` library presents a Word document in a textual format that both LLMs and humans can read, and applies the changes made to that text back to the actual Word document.
6
+
7
+ It creates a "Virtual DOM" of your document, letting AI apply surgical edits without breaking your formatting, numbering, or headers.
8
+
9
+ ---
10
+
11
+ ## Installation
12
+
13
+ Adeu is available on PyPI.
14
+
15
+ ```bash
16
+ pip install adeu
17
+ ```
18
+
19
+ ---
20
+
21
+ ## Ways to Use Adeu
22
+
23
+ ### 1. As MCP Server (No Code Required)
24
+ If you use an agentic system such as Claude Desktop, you can connect Adeu directly. This lets you handle contracts in Claude and say: *"Change the Governing Law to Delaware and generate me the redline."*
25
+
26
+ Add this to your `claude_desktop_config.json`:
27
+
28
+ ```json
29
+ {
30
+ "mcpServers": {
31
+ "adeu": {
32
+ "command": "uvx",
33
+ "args": ["--from", "adeu", "adeu-server"]
34
+ }
35
+ }
36
+ }
37
+ ```
38
+ *(Requires [uv](https://docs.astral.sh/uv/) installed on your machine)*
39
+
40
+ ### 2. For "Vibe Coding" & Python Scripts
41
+ Building your own Agentic AI tool in Cursor, Replit, or Windsurf: Adeu is the engine that handles the document manipulation for you.
42
+
43
+ ```python
44
+ from adeu import RedlineEngine, DocumentEdit
45
+ from io import BytesIO
46
+
47
+ # 1. Load your contract
48
+ with open("NDA.docx", "rb") as f:
49
+ doc_stream = BytesIO(f.read())
50
+
51
+ # 2. Define the change (Usually this comes from your LLM response)
52
+ edit = DocumentEdit(
53
+ target_text="State of New York",
54
+ new_text="State of Delaware",
55
+ comment="Changed governing law to neutral jurisdiction."
56
+ )
57
+
58
+ # 3. Apply the Redline
59
+ engine = RedlineEngine(doc_stream, author="AI Associate")
60
+ engine.apply_edits([edit])
61
+
62
+ # 4. Save
63
+ with open("NDA_Redlined.docx", "wb") as f:
64
+ f.write(engine.save_to_stream().getvalue())
65
+ ```
66
+
67
+ ### 3. The CLI
68
+ Quickly extract text or apply patches from your terminal.
69
+
70
+ ```bash
71
+ # Compare two docs and see a summary
72
+ adeu diff v1.docx v2.docx
73
+
74
+ # Apply a JSON list of edits to a doc
75
+ adeu apply agreement.docx edits.json
76
+ ```
77
+
78
+ ---
79
+
80
+ ## Why Adeu?
81
+
82
+ * **Native Redlines**: Generates real Microsoft Word Track Changes. You can "Accept" or "Reject" them in Word.
83
+ * **Format Safe**: Adeu preserves your complex numbering, headers, footers, and images. It only touches the text you change.
84
+ * **Native Comments**: Supports adding comments (`Review Pane`) linked to specific text ranges.
85
+ * **Intelligent Mapping**: Handles the messy internal XML of Word documents (e.g., when "Contract" is split into `["Con", "tract"]` by spellcheck).
86
+
87
+ ## License
88
+
89
+ MIT License. Open source and free to use in commercial legal tech applications.
@@ -0,0 +1,52 @@
1
+ [tool.poetry]
2
+ name = "adeu"
3
+ version = "0.4.0"
4
+ description = "Automated DOCX Redlining Engine"
5
+ authors = ["Mikko Korpela"]
6
+ readme = "README.md"
7
+ packages = [{include = "adeu", from = "src"}]
8
+
9
+ [tool.poetry.scripts]
10
+ adeu = "adeu.cli:main"
11
+ adeu-server = "adeu.server:main"
12
+
13
+ [tool.poetry.dependencies]
14
+ python = ">=3.12"
15
+ python-docx = ">=1.1.0"
16
+ structlog = ">=24.0.0"
17
+ pydantic = ">=2.0.0"
18
+ lxml = ">=5.0.0"
19
+ diff-match-patch = ">=20230430"
20
+ mcp = ">=1.2.0"
21
+
22
+ [tool.poetry.group.dev.dependencies]
23
+ pytest = "*"
24
+ ruff = "*"
25
+ mypy = "*"
26
+ hypothesis = "*"
27
+
28
+ [build-system]
29
+ requires = ["poetry-core"]
30
+ build-backend = "poetry.core.masonry.api"
31
+
32
+ [tool.ruff]
33
+ line-length = 120
34
+ target-version = "py310"
35
+
36
+ [tool.ruff.lint]
37
+ select = ["E", "F", "I", "B", "W"]
38
+ ignore = []
39
+
40
+ [tool.mypy]
41
+ python_version = "3.12"
42
+ strict = false
43
+ ignore_missing_imports = true
44
+ check_untyped_defs = true
45
+
46
+ [[tool.mypy.overrides]]
47
+ module = "diff_match_patch.*"
48
+ ignore_missing_imports = true
49
+
50
+ [tool.pytest.ini_options]
51
+ testpaths = ["tests"]
52
+ python_files = "test_*.py"
@@ -0,0 +1,9 @@
1
"""Public interface of the adeu package.

Re-exports ``RedlineEngine``, ``DocumentEdit`` and ``extract_text_from_stream``
and exposes the installed distribution version as ``__version__``.
"""

from importlib.metadata import PackageNotFoundError, version

from adeu.ingest import extract_text_from_stream
from adeu.models import DocumentEdit
from adeu.redline.engine import RedlineEngine

try:
    __version__ = version("adeu")
except PackageNotFoundError:
    # No distribution metadata is available (e.g. the package is imported from a
    # source tree that was never installed). Importing the package should still
    # work, so expose a placeholder instead of failing at import time.
    __version__ = "0.0.0+unknown"

__all__ = ["RedlineEngine", "DocumentEdit", "extract_text_from_stream", "__version__"]
@@ -0,0 +1,147 @@
1
+ import argparse
2
+ import getpass
3
+ import json
4
+ import sys
5
+ from io import BytesIO
6
+ from pathlib import Path
7
+ from typing import List
8
+
9
+ from adeu import __version__
10
+ from adeu.diff import generate_edits_from_text
11
+ from adeu.ingest import extract_text_from_stream
12
+ from adeu.models import DocumentEdit
13
+ from adeu.redline.engine import RedlineEngine
14
+
15
+
16
def _read_docx_text(path: Path) -> str:
    """Return the text extracted from the DOCX file at *path*.

    When *path* does not exist, an error is printed to stderr and the process
    exits with status 1.
    """
    if path.exists():
        with open(path, "rb") as handle:
            payload = handle.read()
        return extract_text_from_stream(BytesIO(payload), filename=path.name)

    print(f"Error: File not found: {path}", file=sys.stderr)
    sys.exit(1)
22
+
23
+
24
def _load_edits_from_json(path: Path) -> List[DocumentEdit]:
    """Load a JSON array of edit objects from *path* as ``DocumentEdit`` instances.

    Each item may name its fields ``target_text``/``new_text`` or, alternatively,
    ``original``/``replace``; missing or empty values become ``""``. ``comment``
    is passed through as-is. Any failure while reading, parsing or building the
    edits is reported on stderr and terminates the process with status 1.
    """
    try:
        with open(path, "r", encoding="utf-8") as handle:
            raw_items = json.load(handle)

        return [
            DocumentEdit(
                target_text=(entry.get("target_text") or entry.get("original")) or "",
                new_text=(entry.get("new_text") or entry.get("replace")) or "",
                comment=entry.get("comment"),
            )
            for entry in raw_items
        ]
    except Exception as e:
        print(f"Error parsing JSON edits: {e}", file=sys.stderr)
        sys.exit(1)
39
+
40
+
41
def handle_extract(args):
    """Extract the text of ``args.input`` and write it to ``args.output``, or print it to stdout when no output is given."""
    extracted = _read_docx_text(args.input)

    if not args.output:
        print(extracted)
        return

    with open(args.output, "w", encoding="utf-8") as sink:
        sink.write(extracted)
    print(f"Extracted text to {args.output}", file=sys.stderr)
49
+
50
+
51
def handle_diff(args):
    """Print the differences between ``args.original`` and ``args.modified``.

    ``args.original`` is always read as a DOCX. ``args.modified`` is read as a
    DOCX when its suffix is ".docx" (in any letter case) and as UTF-8 text
    otherwise. With ``args.json`` set, the edits are dumped to stdout as JSON;
    otherwise the edit count goes to stderr and one summary line per edit goes
    to stdout.
    """
    text_orig = _read_docx_text(args.original)

    # Compare the suffix case-insensitively, the same way handle_apply treats
    # ".json", so a file named "V2.DOCX" is parsed as a Word document instead of
    # being opened as UTF-8 text.
    if args.modified.suffix.lower() == ".docx":
        text_mod = _read_docx_text(args.modified)
    else:
        with open(args.modified, "r", encoding="utf-8") as f:
            text_mod = f.read()

    edits = generate_edits_from_text(text_orig, text_mod)

    if args.json:
        output = [e.model_dump(exclude={"_match_start_index"}) for e in edits]
        print(json.dumps(output, indent=2))
        return

    print(f"Found {len(edits)} changes:", file=sys.stderr)
    for e in edits:
        if not e.new_text:
            # Nothing replaces the target: a pure deletion.
            print(f"[-] {e.target_text}")
        elif not e.target_text:
            # No target text: a pure insertion.
            print(f"[+] {e.new_text}")
        else:
            print(f"[~] '{e.target_text}' -> '{e.new_text}'")
74
+
75
+
76
def handle_apply(args):
    """Apply edits to ``args.original`` and save the redlined DOCX.

    Edits come from ``args.changes``: a ``.json`` file (suffix compared
    case-insensitively) is loaded as structured edits; any other file is read as
    UTF-8 text and diffed against the text of the original document. The result
    is written to ``args.output`` or, when that is not given, next to the
    original as ``<stem>_redlined.docx``. Exits with status 1 when any edit was
    skipped.
    """
    if args.changes.suffix.lower() != ".json":
        print(f"Calculating diff from text file {args.changes}...", file=sys.stderr)
        text_orig = _read_docx_text(args.original)
        with open(args.changes, "r", encoding="utf-8") as changes_file:
            text_mod = changes_file.read()
        edits = generate_edits_from_text(text_orig, text_mod)
    else:
        print(f"Loading structured edits from {args.changes}...", file=sys.stderr)
        edits = _load_edits_from_json(args.changes)

    print(f"Applying {len(edits)} edits...", file=sys.stderr)

    with open(args.original, "rb") as source_file:
        stream = BytesIO(source_file.read())

    engine = RedlineEngine(stream, author=args.author)
    applied, skipped = engine.apply_edits(edits)

    output_path = args.output or args.original.with_name(f"{args.original.stem}_redlined.docx")

    with open(output_path, "wb") as target_file:
        target_file.write(engine.save_to_stream().getvalue())

    print(f"✅ Saved to {output_path}", file=sys.stderr)
    print(f"Stats: {applied} applied, {skipped} skipped.", file=sys.stderr)
    if skipped > 0:
        sys.exit(1)
107
+
108
+
109
def main():
    """Build the ``adeu`` argument parser, parse the command line and run the chosen subcommand."""
    # Resolved up front: the value is used both as the --author default and in its help text.
    try:
        fallback_author = getpass.getuser()
    except Exception:
        fallback_author = "Adeu AI"

    parser = argparse.ArgumentParser(prog="adeu", description="Adeu: Agentic DOCX Redlining Engine")
    parser.add_argument("-v", "--version", action="version", version=f"%(prog)s {__version__}")
    commands = parser.add_subparsers(dest="command", required=True, help="Subcommands")

    # extract
    extract_cmd = commands.add_parser("extract", help="Extract raw text from a DOCX file")
    extract_cmd.add_argument("input", type=Path, help="Input DOCX file")
    extract_cmd.add_argument("-o", "--output", type=Path, help="Output file (default: stdout)")
    extract_cmd.set_defaults(func=handle_extract)

    # diff
    diff_cmd = commands.add_parser("diff", help="Compare two files (DOCX vs DOCX/Text)")
    diff_cmd.add_argument("original", type=Path, help="Original DOCX")
    diff_cmd.add_argument("modified", type=Path, help="Modified DOCX or Text file")
    diff_cmd.add_argument("--json", action="store_true", help="Output raw JSON edits")
    diff_cmd.set_defaults(func=handle_diff)

    # apply
    apply_cmd = commands.add_parser("apply", help="Apply edits to a DOCX")
    apply_cmd.add_argument("original", type=Path, help="Original DOCX")
    apply_cmd.add_argument("changes", type=Path, help="JSON edits file OR Modified Text file")
    apply_cmd.add_argument("-o", "--output", type=Path, help="Output DOCX path")
    apply_cmd.add_argument(
        "--author",
        type=str,
        default=fallback_author,
        help=f"Author name for Track Changes (default: '{fallback_author}')",
    )
    apply_cmd.set_defaults(func=handle_apply)

    # Each subparser stored its handler under "func"; call it with the parsed namespace.
    namespace = parser.parse_args()
    namespace.func(namespace)


if __name__ == "__main__":
    main()
@@ -0,0 +1,137 @@
1
+ import re
2
+ from typing import Dict, List, Tuple
3
+
4
+ import structlog
5
+ from diff_match_patch import diff_match_patch
6
+
7
+ from adeu.models import DocumentEdit
8
+
9
+ logger = structlog.get_logger(__name__)
10
+
11
+
12
def generate_edits_from_text(original_text: str, modified_text: str) -> List[DocumentEdit]:
    """
    Compares original and modified text to generate structured DocumentEdit objects.
    Uses Word-Level diffing to ensure natural, readable redlines.

    Both texts are tokenized, each distinct token is encoded as one character, and
    the encoded strings are diffed so that changes never split a token.

    The diff is turned into edits as follows:
      * a deletion followed by an insertion becomes one replacement edit;
      * a deletion on its own becomes an edit with an empty ``new_text``;
      * an insertion on its own is anchored on up to 50 characters of the original
        text preceding it (target = anchor, new = anchor + inserted text);
      * an insertion at the very start of the document has no preceding text, so
        it is anchored on the beginning of the following unchanged text instead.

    Every edit gets ``_match_start_index`` set to an offset into ``original_text``.
    """
    dmp = diff_match_patch()

    # 1. Word-Level Tokenization & Encoding
    chars1, chars2, token_array = _words_to_chars(original_text, modified_text)

    # 2. Compute Diff on the Encoded Strings
    diffs = dmp.diff_main(chars1, chars2, False)

    # 3. Semantic Cleanup
    dmp.diff_cleanupSemantic(diffs)

    # 4. Decode back to Text (rewrites the diff tuples in place)
    dmp.diff_charsToLines(diffs, token_array)

    def make_edit(target: str, new: str, comment: str, start_index: int) -> DocumentEdit:
        # Build an edit and record where in the original text it applies.
        edit = DocumentEdit(target_text=target, new_text=new, comment=comment)
        edit._match_start_index = start_index
        return edit

    edits: List[DocumentEdit] = []
    current_original_index = 0
    pending_delete = None  # Tuple(index, text)

    for i, (op, text) in enumerate(diffs):
        if op == 0:  # Equal
            # A deletion not followed by an insertion is a plain deletion.
            if pending_delete is not None:
                idx, del_txt = pending_delete
                edits.append(make_edit(del_txt, "", "Diff: Text deleted", idx))
                pending_delete = None

            current_original_index += len(text)

        elif op == -1:  # Delete
            # Defer deletion to check for immediate insertion (Modification)
            pending_delete = (current_original_index, text)
            current_original_index += len(text)

        elif op == 1:  # Insert
            if pending_delete is not None:
                # Merge into Modification (Replace)
                idx, del_txt = pending_delete
                edits.append(make_edit(del_txt, text, "Diff: Replacement", idx))
                pending_delete = None
                continue

            # Pure Insertion: anchor on the original text just before it.
            anchor_start = max(0, current_original_index - 50)
            anchor = original_text[anchor_start:current_original_index]

            # Special Case: Start-of-Document with no anchor
            if not anchor and current_original_index == 0:
                # Check next equal for context (Forward Anchor)
                if i + 1 < len(diffs) and diffs[i + 1][0] == 0:
                    next_text = diffs[i + 1][1]
                    # Grab first word or chunk
                    anchor_target = next_text.split(" ")[0] if " " in next_text else next_text[:20]
                    if anchor_target:
                        # Convert to Modification of the following text
                        # Target: "Contract" -> New: "Big Contract"
                        logger.info(f"Converting start-of-doc insert to modification of '{anchor_target}'")
                        edits.append(
                            make_edit(
                                anchor_target,
                                text + anchor_target,
                                "Diff: Start-of-doc insertion",
                                current_original_index,
                            )
                        )
                        # This insertion is now fully represented. Falling through
                        # would emit a second, empty-target edit for the same text
                        # and duplicate the insertion.
                        continue

            # Standard Insertion: Target=Anchor, New=Anchor+Text
            edits.append(make_edit(anchor, anchor + text, "Diff: Text inserted", current_original_index))

    # Flush trailing delete
    if pending_delete is not None:
        idx, del_txt = pending_delete
        edits.append(make_edit(del_txt, "", "Diff: Text deleted", idx))

    return edits
112
+
113
+
114
def _words_to_chars(text1: str, text2: str) -> Tuple[str, str, List[str]]:
    """
    Tokenizes both texts and encodes each distinct token as a single character.

    A token is a run of whitespace, a run of word characters, or one other
    character. Tokens are numbered in order of first appearance (text1 first,
    then text2) and each is represented by ``chr(number)``.

    Returns the encoded text1, the encoded text2, and the token list in which
    position ``n`` holds the token encoded as ``chr(n)``.
    """
    token_array: List[str] = []
    token_hash: Dict[str, int] = {}
    tokenizer = re.compile(r"(\s+|\w+|[^\w\s])")

    def encode_text(text: str) -> str:
        symbols = []
        # The pattern is a capturing group, so split() returns the tokens themselves;
        # the empty strings between adjacent matches are filtered out.
        for token in filter(None, tokenizer.split(text)):
            code = token_hash.get(token)
            if code is None:
                code = len(token_array)
                token_hash[token] = code
                token_array.append(token)
            symbols.append(chr(code))
        return "".join(symbols)

    encoded_first = encode_text(text1)
    encoded_second = encode_text(text2)
    return encoded_first, encoded_second, token_array
@@ -0,0 +1,85 @@
1
+ # FILE: src/adeu/ingest.py
2
+
3
+ import io
4
+
5
+ import structlog
6
+ from docx import Document
7
+
8
+ from adeu.utils.docx import (
9
+ get_paragraph_prefix,
10
+ get_run_style_markers,
11
+ get_run_text,
12
+ get_visible_runs,
13
+ iter_document_parts,
14
+ )
15
+
16
+ logger = structlog.get_logger(__name__)
17
+
18
+
19
def extract_text_from_stream(file_stream: io.BytesIO, filename: str = "document.docx") -> str:
    """
    Extracts text from a DOCX stream by concatenating the text of visible runs.

    CRITICAL: This must match DocumentMapper._build_map logic exactly.
    Runs are iterated and joined; para.text is not used.

    For every document part, each paragraph becomes its paragraph prefix followed
    by its runs wrapped in their style markers. Each table row becomes its
    non-empty cells joined with " | ", where a cell is its paragraphs joined with
    a newline. All entries are joined with a blank line. ``filename`` is not used
    in this function body.

    Raises:
        ValueError: if anything fails while reading the document; the original
            exception is chained as the cause.
    """

    def render_runs(paragraph) -> str:
        # Concatenate the visible runs, each wrapped in its style markers.
        pieces = []
        for run in get_visible_runs(paragraph):
            lead, trail = get_run_style_markers(run)
            run_text = get_run_text(run)
            pieces.append(f"{lead}{run_text}{trail}")
        return "".join(pieces)

    try:
        # Ensure stream is at start
        file_stream.seek(0)

        doc = Document(file_stream)
        blocks = []

        for part in iter_document_parts(doc):
            # 1. Paragraphs
            for para in part.paragraphs:
                body = render_runs(para)
                # Add Markdown prefix if heading
                blocks.append(get_paragraph_prefix(para) + body)

            # 2. Tables
            for table in part.tables:
                for row in table.rows:
                    cells_text = []
                    for cell in row.cells:
                        # Styled prefixes are kept inside tables too, for consistency
                        # with top-level paragraphs.
                        cell_lines = []
                        for cell_para in cell.paragraphs:
                            heading = get_paragraph_prefix(cell_para)
                            cell_lines.append(heading + render_runs(cell_para))

                        joined = "\n".join(cell_lines)
                        # Empty cells are left out of the row.
                        if joined:
                            cells_text.append(joined)

                    if cells_text:
                        blocks.append(" | ".join(cells_text))

        return "\n\n".join(blocks)

    except Exception as e:
        logger.error(f"Text extraction failed: {e}", exc_info=True)
        raise ValueError(f"Could not extract text: {str(e)}") from e