docspan 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docspan/__init__.py +3 -0
- docspan/__main__.py +0 -0
- docspan/backends/__init__.py +19 -0
- docspan/backends/base.py +85 -0
- docspan/backends/confluence/__init__.py +0 -0
- docspan/backends/confluence/adf/__init__.py +14 -0
- docspan/backends/confluence/adf/comparator.py +427 -0
- docspan/backends/confluence/adf/converter.py +119 -0
- docspan/backends/confluence/adf/converters.py +1449 -0
- docspan/backends/confluence/adf/interfaces.py +191 -0
- docspan/backends/confluence/adf/nodes.py +2085 -0
- docspan/backends/confluence/adf/parser.py +400 -0
- docspan/backends/confluence/adf/validators.py +161 -0
- docspan/backends/confluence/adf/visitors.py +495 -0
- docspan/backends/confluence/backend.py +227 -0
- docspan/backends/confluence/client.py +44 -0
- docspan/backends/confluence/config/__init__.py +21 -0
- docspan/backends/confluence/config/loader.py +107 -0
- docspan/backends/confluence/config/models.py +167 -0
- docspan/backends/confluence/config/validation.py +297 -0
- docspan/backends/confluence/markdown/__init__.py +22 -0
- docspan/backends/confluence/markdown/ast.py +819 -0
- docspan/backends/confluence/markdown/extensions/__init__.py +5 -0
- docspan/backends/confluence/markdown/extensions/frontmatter.py +80 -0
- docspan/backends/confluence/markdown/extensions/mermaid.py +64 -0
- docspan/backends/confluence/markdown/extensions/wikilinks.py +179 -0
- docspan/backends/confluence/markdown/inline_parser.py +495 -0
- docspan/backends/confluence/markdown/parser.py +1006 -0
- docspan/backends/confluence/models/__init__.py +18 -0
- docspan/backends/confluence/models/markdown_file.py +402 -0
- docspan/backends/confluence/models/page.py +212 -0
- docspan/backends/confluence/models/path_utils.py +34 -0
- docspan/backends/confluence/models/results.py +28 -0
- docspan/backends/confluence/models/sync_status.py +382 -0
- docspan/backends/confluence/services/__init__.py +0 -0
- docspan/backends/confluence/services/confluence/__init__.py +40 -0
- docspan/backends/confluence/services/confluence/attachment_client.py +147 -0
- docspan/backends/confluence/services/confluence/base_client.py +420 -0
- docspan/backends/confluence/services/confluence/client.py +376 -0
- docspan/backends/confluence/services/confluence/comment_client.py +682 -0
- docspan/backends/confluence/services/confluence/crawler.py +587 -0
- docspan/backends/confluence/services/confluence/label_client.py +130 -0
- docspan/backends/confluence/services/confluence/page_client.py +1288 -0
- docspan/backends/confluence/services/confluence/space_client.py +179 -0
- docspan/backends/confluence/services/confluence/url_parser.py +106 -0
- docspan/backends/google_docs/__init__.py +0 -0
- docspan/backends/google_docs/auth.py +143 -0
- docspan/backends/google_docs/backend.py +140 -0
- docspan/backends/google_docs/client.py +665 -0
- docspan/backends/google_docs/converter.py +471 -0
- docspan/backends/google_docs/docs_request_builder.py +232 -0
- docspan/backends/google_docs/docs_structure_parser.py +120 -0
- docspan/backends/google_docs/markdown_to_paragraph_parser.py +145 -0
- docspan/cli/__init__.py +0 -0
- docspan/cli/main.py +408 -0
- docspan/config.py +62 -0
- docspan/core/__init__.py +49 -0
- docspan/core/merge.py +30 -0
- docspan/core/orchestrator.py +332 -0
- docspan/core/paths.py +8 -0
- docspan/core/state.py +53 -0
- docspan-0.1.0.dist-info/METADATA +273 -0
- docspan-0.1.0.dist-info/RECORD +65 -0
- docspan-0.1.0.dist-info/WHEEL +4 -0
- docspan-0.1.0.dist-info/entry_points.txt +2 -0
docspan/__init__.py
ADDED
docspan/__main__.py
ADDED
|
File without changes
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
"""Backend registry — maps backend names to their classes."""
|
|
2
|
+
|
|
3
|
+
from docspan.backends.base import Backend, PullResult, PushResult, RemoteDoc, SyncDirection
|
|
4
|
+
from docspan.backends.confluence.backend import ConfluenceBackend
|
|
5
|
+
from docspan.backends.google_docs.backend import GoogleDocsBackend
|
|
6
|
+
|
|
7
|
+
BACKENDS: dict[str, type[Backend]] = {
|
|
8
|
+
"google_docs": GoogleDocsBackend,
|
|
9
|
+
"confluence": ConfluenceBackend,
|
|
10
|
+
}
|
|
11
|
+
|
|
12
|
+
__all__ = [
|
|
13
|
+
"Backend",
|
|
14
|
+
"SyncDirection",
|
|
15
|
+
"RemoteDoc",
|
|
16
|
+
"PushResult",
|
|
17
|
+
"PullResult",
|
|
18
|
+
"BACKENDS",
|
|
19
|
+
]
|
docspan/backends/base.py
ADDED
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
"""Abstract backend interface. Every platform adapter implements this."""
|
|
2
|
+
|
|
3
|
+
import inspect
|
|
4
|
+
from abc import ABC, abstractmethod
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from enum import Enum
|
|
7
|
+
from typing import Literal, Optional
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class SyncDirection(str, Enum):
    """Direction in which a document is synchronised.

    Members are also plain strings, so they compare equal to their values.
    """

    # local markdown → remote doc
    PUSH = "push"
    # remote doc → local markdown
    PULL = "pull"
    # both directions
    BOTH = "both"
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
@dataclass
class RemoteDoc:
    """A document fetched from a remote platform, with its body as markdown."""

    # Required fields
    doc_id: str
    title: str
    content_markdown: str

    # Optional metadata; left as None when not supplied
    last_modified: Optional[str] = None
    url: Optional[str] = None
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
@dataclass
class PushResult:
    """Outcome of a push: a status tag, the document id, and optional details."""

    # One of the four literal outcome tags
    status: Literal["ok", "conflict", "error", "skipped"]
    doc_id: str

    # Optional details; left as None when not supplied
    message: Optional[str] = None
    url: Optional[str] = None
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
@dataclass
class PullResult:
    """Outcome of a pull: a status tag, the document id, and the local path."""

    # One of the four literal outcome tags
    status: Literal["ok", "conflict", "error", "skipped"]
    doc_id: str
    local_path: str

    # Optional detail; left as None when not supplied
    message: Optional[str] = None
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class Backend(ABC):
    """
    Abstract contract that every docspan platform adapter fulfils.

    Adding a new backend:
    1. Derive a class from Backend.
    2. Give it a `name` class attribute (e.g. name = "my_backend").
    3. Provide push(), pull(), auth_setup(), get_remote_version() and
       validate_config() -- all five are declared abstract below.
    4. Register the class in src/docspan/backends/__init__.py.
    """

    name: str  # must be overridden in every concrete subclass

    def __init_subclass__(cls, **kwargs: object) -> None:
        """Reject concrete subclasses that do not declare their own `name`."""
        super().__init_subclass__(**kwargs)

        # Classes that still have abstract methods are exempt from the check.
        if inspect.isabstract(cls):
            return

        # Looked up in the class's own namespace, so an inherited `name`
        # does not satisfy the requirement.
        if "name" in cls.__dict__:
            return

        raise TypeError(
            f"{cls.__name__} must define a 'name' class attribute (e.g. name = 'my_backend')"
        )

    @abstractmethod
    def push(self, local_path: str, doc_id: str, **kwargs) -> PushResult:
        """Convert the local markdown file and update the remote document."""

    @abstractmethod
    def pull(self, doc_id: str, local_path: str, **kwargs) -> PullResult:
        """Fetch the remote document and write it out as local markdown."""

    @abstractmethod
    def auth_setup(self) -> None:
        """Run the interactive / instructional setup wizard for this backend."""

    @abstractmethod
    def get_remote_version(self, doc_id: str) -> str:
        """
        Return an opaque version token for the current remote document state.

        - Google Docs: returns doc['revisionId'] (opaque string)
        - Confluence: returns str(page['version']['number']) (monotonic integer as string)

        Tokens are compared with == to detect remote changes between syncs.
        """

    @abstractmethod
    def validate_config(self) -> None:
        """Raise ValueError with a clear message if config is missing required keys."""
|
|
File without changes
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Atlassian Document Format (ADF) conversion module.
|
|
3
|
+
|
|
4
|
+
This module provides components for converting Markdown AST to
|
|
5
|
+
Atlassian Document Format (ADF).
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from docspan.backends.confluence.adf.converter import AdfConverter
|
|
9
|
+
from docspan.backends.confluence.adf.nodes import AdfNode
|
|
10
|
+
|
|
11
|
+
__all__ = [
|
|
12
|
+
"AdfConverter",
|
|
13
|
+
"AdfNode",
|
|
14
|
+
]
|
|
@@ -0,0 +1,427 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Markdown-to-ADF comparison tool.
|
|
3
|
+
|
|
4
|
+
This module provides functionality to compare markdown input with generated ADF output
|
|
5
|
+
to identify discrepancies and issues in the conversion process.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import difflib
|
|
9
|
+
from dataclasses import dataclass, field
|
|
10
|
+
from enum import Enum
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
from typing import Any, Dict, List, Optional
|
|
13
|
+
|
|
14
|
+
from docspan.backends.confluence.adf.converter import AdfConverter
|
|
15
|
+
from docspan.backends.confluence.adf.parser import AdfDocument, AdfNodeType, AdfParser
|
|
16
|
+
from docspan.backends.confluence.markdown.ast import MarkdownNode
|
|
17
|
+
from docspan.backends.confluence.markdown.parser import MarkdownParser
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class DifferenceType(Enum):
    """Categories of discrepancy a comparison can record."""

    # Content present on one side only
    MISSING_CONTENT = "missing_content"
    EXTRA_CONTENT = "extra_content"

    # Mismatches between the two sides
    STRUCTURE_MISMATCH = "structure_mismatch"
    TEXT_MISMATCH = "text_mismatch"
    ATTRIBUTE_MISMATCH = "attribute_mismatch"
    LINK_MISMATCH = "link_mismatch"
    FORMAT_MISMATCH = "format_mismatch"
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
@dataclass
class Difference:
    """A single discrepancy found between markdown and ADF."""

    type: DifferenceType
    location: str
    expected: Any
    actual: Any
    severity: str = "medium"  # low, medium, high
    description: Optional[str] = None

    def to_dict(self) -> Dict[str, Any]:
        """Return this difference as a plain dictionary.

        The type is reduced to its enum value, and `expected` / `actual`
        are passed through str().
        """
        payload: Dict[str, Any] = {
            "type": self.type.value,
            "location": self.location,
        }
        payload["expected"] = str(self.expected)
        payload["actual"] = str(self.actual)
        payload["severity"] = self.severity
        payload["description"] = self.description
        return payload
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
@dataclass
class ComparisonReport:
    """Accumulated outcome of one markdown-vs-ADF comparison."""

    markdown_file: Optional[Path] = None
    total_differences: int = 0
    differences: List[Difference] = field(default_factory=list)
    markdown_stats: Dict[str, Any] = field(default_factory=dict)
    adf_stats: Dict[str, Any] = field(default_factory=dict)
    success: bool = True

    def add_difference(
        self,
        diff_type: DifferenceType,
        location: str,
        expected: Any,
        actual: Any,
        severity: str = "medium",
        description: Optional[str] = None,
    ) -> None:
        """Record a difference, bump the counter and mark the report as failed."""
        self.differences.append(
            Difference(
                type=diff_type,
                location=location,
                expected=expected,
                actual=actual,
                severity=severity,
                description=description,
            )
        )
        self.total_differences += 1
        self.success = False

    def get_differences_by_type(self, diff_type: DifferenceType) -> List[Difference]:
        """Return the recorded differences whose type equals `diff_type`."""
        matching = []
        for entry in self.differences:
            if entry.type == diff_type:
                matching.append(entry)
        return matching

    def get_differences_by_severity(self, severity: str) -> List[Difference]:
        """Return the recorded differences whose severity equals `severity`."""
        matching = []
        for entry in self.differences:
            if entry.severity == severity:
                matching.append(entry)
        return matching

    def to_dict(self) -> Dict[str, Any]:
        """Return the report as a plain dictionary.

        Includes per-type counts for every DifferenceType member and
        per-severity counts for "low", "medium" and "high".
        """
        type_counts = {}
        for kind in DifferenceType:
            type_counts[kind.value] = len(self.get_differences_by_type(kind))

        severity_counts = {}
        for level in ["low", "medium", "high"]:
            severity_counts[level] = len(self.get_differences_by_severity(level))

        if self.markdown_file:
            source = str(self.markdown_file)
        else:
            source = None

        return {
            "markdown_file": source,
            "success": self.success,
            "total_differences": self.total_differences,
            "differences": [entry.to_dict() for entry in self.differences],
            "differences_by_type": type_counts,
            "differences_by_severity": severity_counts,
            "markdown_stats": self.markdown_stats,
            "adf_stats": self.adf_stats,
        }
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
class MarkdownAdfComparator:
    """
    Compare markdown input with generated ADF output.

    This class provides methods to identify discrepancies between markdown
    source and the ADF that was generated from it, or between the generated
    ADF and an already existing ADF document.
    """

    def __init__(self) -> None:
        """Create the markdown parser, ADF converter and ADF parser used by comparisons."""
        self.markdown_parser = MarkdownParser()
        self.adf_converter = AdfConverter()
        self.adf_parser = AdfParser()

    def compare_from_markdown(
        self,
        markdown_content: str,
        markdown_file: Optional[Path] = None,
    ) -> ComparisonReport:
        """
        Compare markdown content with its generated ADF.

        The markdown is parsed, converted to ADF, and the ADF is parsed back;
        structure counts, text content and link counts are then compared.

        Args:
            markdown_content: Markdown content to convert and compare
            markdown_file: Optional path to the markdown file

        Returns:
            Comparison report. Any exception raised along the way is not
            propagated; it is recorded as a high-severity STRUCTURE_MISMATCH
            at location "root".
        """
        report = ComparisonReport(markdown_file=markdown_file)

        try:
            # Parse markdown
            md_nodes = self.markdown_parser.parse(markdown_content)

            # Convert to ADF
            adf_dict = self.adf_converter.convert(md_nodes)

            # Parse ADF back
            adf_doc = self.adf_parser.parse(adf_dict)

            # Collect statistics
            report.markdown_stats = self._get_markdown_stats(md_nodes)
            report.adf_stats = adf_doc.get_node_statistics()

            # Run the individual comparisons
            self._compare_structure(md_nodes, adf_doc, report)
            self._compare_text_content(md_nodes, adf_doc, report)
            self._compare_links(md_nodes, adf_doc, report)

        except Exception as e:
            # Deliberately broad: a failed conversion is reported as a
            # finding rather than raised to the caller.
            report.add_difference(
                DifferenceType.STRUCTURE_MISMATCH,
                "root",
                "valid conversion",
                f"conversion failed: {e}",
                severity="high",
                description=str(e),
            )

        return report

    def compare_with_existing_adf(
        self,
        markdown_content: str,
        existing_adf: Dict[str, Any],
        markdown_file: Optional[Path] = None,
    ) -> ComparisonReport:
        """
        Compare markdown with an existing ADF document.

        Args:
            markdown_content: Markdown source content
            existing_adf: Existing ADF document (from Confluence)
            markdown_file: Optional path to the markdown file

        Returns:
            Comparison report. Any exception raised along the way is not
            propagated; it is recorded as a high-severity STRUCTURE_MISMATCH
            at location "root".
        """
        report = ComparisonReport(markdown_file=markdown_file)

        try:
            # Parse markdown
            md_nodes = self.markdown_parser.parse(markdown_content)

            # Convert to ADF
            generated_adf_dict = self.adf_converter.convert(md_nodes)

            # Parse both ADF documents
            generated_adf = self.adf_parser.parse(generated_adf_dict)
            existing_adf_doc = self.adf_parser.parse(existing_adf)

            # Collect statistics
            report.markdown_stats = self._get_markdown_stats(md_nodes)
            report.adf_stats = {
                "generated": generated_adf.get_node_statistics(),
                "existing": existing_adf_doc.get_node_statistics(),
            }

            # Compare the two ADF documents
            self._compare_adf_documents(generated_adf, existing_adf_doc, report)

        except Exception as e:
            # Deliberately broad: a failed comparison is reported as a
            # finding rather than raised to the caller.
            report.add_difference(
                DifferenceType.STRUCTURE_MISMATCH,
                "root",
                "successful comparison",
                f"comparison failed: {e}",
                severity="high",
                description=str(e),
            )

        return report

    def _compare_structure(
        self,
        md_nodes: List[MarkdownNode],
        adf_doc: AdfDocument,
        report: ComparisonReport,
    ) -> None:
        """Compare heading, list and code-block counts of markdown and ADF."""
        # Flatten once and reuse the result for all three counts.
        flat_nodes = self._flatten_md_nodes(md_nodes)
        md_headings = sum(1 for n in flat_nodes if n.type == "heading")
        md_lists = sum(1 for n in flat_nodes if n.type in ("bulletList", "orderedList"))
        md_code_blocks = sum(1 for n in flat_nodes if n.type == "codeBlock")

        adf_headings = len(adf_doc.root.find_nodes_by_type(AdfNodeType.HEADING))
        adf_lists = len(adf_doc.root.find_nodes_by_type(AdfNodeType.BULLET_LIST)) + len(
            adf_doc.root.find_nodes_by_type(AdfNodeType.ORDERED_LIST)
        )
        adf_code_blocks = len(adf_doc.root.find_nodes_by_type(AdfNodeType.CODE_BLOCK))

        # Compare counts
        if md_headings != adf_headings:
            report.add_difference(
                DifferenceType.STRUCTURE_MISMATCH,
                "headings",
                md_headings,
                adf_headings,
                severity="high",
                description=f"Heading count mismatch: {md_headings} in markdown vs {adf_headings} in ADF",
            )

        if md_lists != adf_lists:
            report.add_difference(
                DifferenceType.STRUCTURE_MISMATCH,
                "lists",
                md_lists,
                adf_lists,
                severity="medium",
                description=f"List count mismatch: {md_lists} in markdown vs {adf_lists} in ADF",
            )

        if md_code_blocks != adf_code_blocks:
            report.add_difference(
                DifferenceType.STRUCTURE_MISMATCH,
                "code_blocks",
                md_code_blocks,
                adf_code_blocks,
                severity="medium",
                description=f"Code block count mismatch: {md_code_blocks} in markdown vs {adf_code_blocks} in ADF",
            )

    def _compare_text_content(
        self,
        md_nodes: List[MarkdownNode],
        adf_doc: AdfDocument,
        report: ComparisonReport,
    ) -> None:
        """Compare whitespace-normalised text of markdown and ADF.

        A difference is recorded when the similarity ratio is below 0.95;
        it is "high" severity below 0.8 and "medium" otherwise.
        """
        md_text = self._extract_markdown_text(md_nodes)
        adf_text = adf_doc.get_all_text()

        # Collapse runs of whitespace so layout differences are ignored
        md_text_normalized = " ".join(md_text.split())
        adf_text_normalized = " ".join(adf_text.split())

        similarity = difflib.SequenceMatcher(None, md_text_normalized, adf_text_normalized).ratio()

        if similarity < 0.95:  # Less than 95% similar
            report.add_difference(
                DifferenceType.TEXT_MISMATCH,
                "text_content",
                "similarity: 100%",
                f"similarity: {similarity * 100:.1f}%",
                severity="high" if similarity < 0.8 else "medium",
                description=f"Text content similarity is {similarity * 100:.1f}%",
            )

    def _compare_links(
        self,
        md_nodes: List[MarkdownNode],
        adf_doc: AdfDocument,
        report: ComparisonReport,
    ) -> None:
        """Compare the number of links in markdown and ADF (counts only)."""
        md_links = self._extract_markdown_links(md_nodes)
        adf_links = adf_doc.find_links()

        if len(md_links) != len(adf_links):
            report.add_difference(
                DifferenceType.LINK_MISMATCH,
                "links",
                len(md_links),
                len(adf_links),
                severity="medium",
                description=f"Link count mismatch: {len(md_links)} in markdown vs {len(adf_links)} in ADF",
            )

    def _compare_adf_documents(
        self,
        generated: AdfDocument,
        existing: AdfDocument,
        report: ComparisonReport,
    ) -> None:
        """Compare node statistics and raw text of two ADF documents."""
        gen_stats = generated.get_node_statistics()
        exist_stats = existing.get_node_statistics()

        # Visit every node type seen on either side. Generated types come
        # first in their original order, followed by types that appear only
        # in the existing document, so those are reported too.
        node_types = list(gen_stats)
        node_types.extend(t for t in exist_stats if t not in gen_stats)

        for node_type in node_types:
            count = gen_stats.get(node_type, 0)
            existing_count = exist_stats.get(node_type, 0)
            if count != existing_count:
                report.add_difference(
                    DifferenceType.STRUCTURE_MISMATCH,
                    f"node_count.{node_type}",
                    count,
                    existing_count,
                    severity="medium",
                    description=f"Node count for {node_type}: {count} generated vs {existing_count} existing",
                )

        # Compare text content (exact match, no whitespace normalisation)
        gen_text = generated.get_all_text()
        exist_text = existing.get_all_text()

        if gen_text != exist_text:
            similarity = difflib.SequenceMatcher(None, gen_text, exist_text).ratio()
            report.add_difference(
                DifferenceType.TEXT_MISMATCH,
                "text_content",
                f"generated text ({len(gen_text)} chars)",
                f"existing text ({len(exist_text)} chars)",
                severity="high" if similarity < 0.8 else "medium",
                description=f"Text similarity: {similarity * 100:.1f}%",
            )

    def _get_markdown_stats(self, nodes: List[MarkdownNode]) -> Dict[str, Any]:
        """Return the total node count and a per-type count for the markdown tree."""
        all_nodes = self._flatten_md_nodes(nodes)
        node_types: Dict[Any, int] = {}

        for node in all_nodes:
            node_type = node.type  # type is already a string, not an enum
            node_types[node_type] = node_types.get(node_type, 0) + 1

        return {
            "total_nodes": len(all_nodes),
            "node_types": node_types,
        }

    def _flatten_md_nodes(self, nodes: List[MarkdownNode]) -> List[MarkdownNode]:
        """Flatten the markdown node tree into a list, parents before children."""
        result = []

        for node in nodes:
            result.append(node)
            if hasattr(node, 'children') and node.children:
                result.extend(self._flatten_md_nodes(node.children))

        return result

    def _extract_markdown_text(self, nodes: List[MarkdownNode]) -> str:
        """Collect `content` from the nodes and their descendants, joined by spaces."""
        text_parts = []

        for node in nodes:
            if hasattr(node, 'content') and node.content:
                text_parts.append(node.content)

            if hasattr(node, 'children') and node.children:
                text_parts.append(self._extract_markdown_text(node.children))

        return " ".join(text_parts)

    def _extract_markdown_links(self, nodes: List[MarkdownNode]) -> List[Dict[str, Any]]:
        """Return url/title/text for every node of type "link" in the tree.

        Attributes a link node lacks are reported as None.
        """
        return [
            {
                "url": getattr(node, 'url', None),
                "title": getattr(node, 'title', None),
                "text": getattr(node, 'content', None),
            }
            for node in self._flatten_md_nodes(nodes)
            if node.type == "link"
        ]
|
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Converter for transforming Markdown AST to ADF.
|
|
3
|
+
|
|
4
|
+
This module provides the main entry point for converting Markdown AST to
|
|
5
|
+
Atlassian Document Format (ADF).
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from typing import Any, Dict, List
|
|
9
|
+
|
|
10
|
+
from docspan.backends.confluence.adf.converters import ConverterFactory
|
|
11
|
+
from docspan.backends.confluence.adf.nodes import AdfBuilder
|
|
12
|
+
from docspan.backends.confluence.markdown.ast import MarkdownNode
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class AdfConverter:
    """
    Convert Markdown AST to Atlassian Document Format (ADF).

    This class serves as a facade for the ADF conversion system, maintaining
    backward compatibility with existing code while delegating to the
    converter obtained from ConverterFactory.
    """

    def __init__(self) -> None:
        """Create the ADF builder and the delegate converter."""
        self.builder = AdfBuilder()
        self.converter = ConverterFactory.create_converter()

    def convert(self, nodes: List[MarkdownNode], title: str = None, skip_first_h1_matching_title: bool = False) -> Dict[str, Any]:
        """
        Convert Markdown nodes to ADF.

        Args:
            nodes: List of Markdown AST nodes
            title: Optional page title to compare against first H1
            skip_first_h1_matching_title: If True, skip the first H1 heading that matches the title

        Returns:
            ADF document as a dictionary. When any top-level node carries a
            truthy `storage_format_html` attribute, the dictionary also gets
            a 'storage_format_html' key listing those values in node order.
        """
        # Filter nodes to skip first H1 if it matches the title
        filtered_nodes = self._filter_duplicate_title_heading(nodes, title, skip_first_h1_matching_title)

        # Get the standard ADF document
        adf = self.converter.convert(filtered_nodes)

        # Collect raw HTML (e.g. iframes) that ADF cannot express directly.
        # The original author's note says the page client picks this up
        # during the page update; that consumer is not visible here.
        #
        # The fragments are gathered first and assigned once. Testing the
        # dict with hasattr() is always False for a plain dict, which reset
        # the list for every node and kept only the last fragment.
        html_fragments = [
            node.storage_format_html
            for node in filtered_nodes
            if getattr(node, 'storage_format_html', None)
        ]
        if html_fragments:
            adf['storage_format_html'] = html_fragments

        return adf

    def _filter_duplicate_title_heading(
        self,
        nodes: List[MarkdownNode],
        title: str = None,
        skip_first_h1: bool = False
    ) -> List[MarkdownNode]:
        """
        Filter out the first H1 heading if it matches the page title.

        This prevents duplicate titles when the title is extracted from the first H1
        and then that same H1 appears in the content.

        Args:
            nodes: List of Markdown AST nodes
            title: The page title to compare against
            skip_first_h1: Whether to perform filtering

        Returns:
            Filtered list of nodes. The input list is returned unchanged when
            filtering is disabled, when title or nodes is empty, or when the
            first H1 does not match.
        """
        if not skip_first_h1 or not title or not nodes:
            return nodes

        from docspan.backends.confluence.markdown.ast import HeadingNode

        for i, node in enumerate(nodes):
            if isinstance(node, HeadingNode) and node.level == 1:
                # Extract text from the heading's children
                heading_text = self._extract_text_from_nodes(node.children)

                # If it matches the title (ignoring surrounding whitespace), drop it
                if heading_text.strip() == title.strip():
                    return nodes[:i] + nodes[i+1:]

                # Only the first H1 is considered; a non-matching one ends the search
                break

        return nodes

    def _extract_text_from_nodes(self, nodes: List[MarkdownNode]) -> str:
        """
        Extract plain text from a list of nodes (recursively).

        Args:
            nodes: List of nodes to extract text from

        Returns:
            Concatenated content of all TextNode instances found. Nodes whose
            `children` is missing, None or empty contribute nothing.
        """
        from docspan.backends.confluence.markdown.ast import TextNode

        text_parts = []
        for node in nodes:
            if isinstance(node, TextNode):
                text_parts.append(node.content)
                continue

            # Guard against `children` being None as well as absent
            children = getattr(node, 'children', None)
            if children:
                text_parts.append(self._extract_text_from_nodes(children))

        return ''.join(text_parts)
|