docspan 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65) hide show
  1. docspan/__init__.py +3 -0
  2. docspan/__main__.py +0 -0
  3. docspan/backends/__init__.py +19 -0
  4. docspan/backends/base.py +85 -0
  5. docspan/backends/confluence/__init__.py +0 -0
  6. docspan/backends/confluence/adf/__init__.py +14 -0
  7. docspan/backends/confluence/adf/comparator.py +427 -0
  8. docspan/backends/confluence/adf/converter.py +119 -0
  9. docspan/backends/confluence/adf/converters.py +1449 -0
  10. docspan/backends/confluence/adf/interfaces.py +191 -0
  11. docspan/backends/confluence/adf/nodes.py +2085 -0
  12. docspan/backends/confluence/adf/parser.py +400 -0
  13. docspan/backends/confluence/adf/validators.py +161 -0
  14. docspan/backends/confluence/adf/visitors.py +495 -0
  15. docspan/backends/confluence/backend.py +227 -0
  16. docspan/backends/confluence/client.py +44 -0
  17. docspan/backends/confluence/config/__init__.py +21 -0
  18. docspan/backends/confluence/config/loader.py +107 -0
  19. docspan/backends/confluence/config/models.py +167 -0
  20. docspan/backends/confluence/config/validation.py +297 -0
  21. docspan/backends/confluence/markdown/__init__.py +22 -0
  22. docspan/backends/confluence/markdown/ast.py +819 -0
  23. docspan/backends/confluence/markdown/extensions/__init__.py +5 -0
  24. docspan/backends/confluence/markdown/extensions/frontmatter.py +80 -0
  25. docspan/backends/confluence/markdown/extensions/mermaid.py +64 -0
  26. docspan/backends/confluence/markdown/extensions/wikilinks.py +179 -0
  27. docspan/backends/confluence/markdown/inline_parser.py +495 -0
  28. docspan/backends/confluence/markdown/parser.py +1006 -0
  29. docspan/backends/confluence/models/__init__.py +18 -0
  30. docspan/backends/confluence/models/markdown_file.py +402 -0
  31. docspan/backends/confluence/models/page.py +212 -0
  32. docspan/backends/confluence/models/path_utils.py +34 -0
  33. docspan/backends/confluence/models/results.py +28 -0
  34. docspan/backends/confluence/models/sync_status.py +382 -0
  35. docspan/backends/confluence/services/__init__.py +0 -0
  36. docspan/backends/confluence/services/confluence/__init__.py +40 -0
  37. docspan/backends/confluence/services/confluence/attachment_client.py +147 -0
  38. docspan/backends/confluence/services/confluence/base_client.py +420 -0
  39. docspan/backends/confluence/services/confluence/client.py +376 -0
  40. docspan/backends/confluence/services/confluence/comment_client.py +682 -0
  41. docspan/backends/confluence/services/confluence/crawler.py +587 -0
  42. docspan/backends/confluence/services/confluence/label_client.py +130 -0
  43. docspan/backends/confluence/services/confluence/page_client.py +1288 -0
  44. docspan/backends/confluence/services/confluence/space_client.py +179 -0
  45. docspan/backends/confluence/services/confluence/url_parser.py +106 -0
  46. docspan/backends/google_docs/__init__.py +0 -0
  47. docspan/backends/google_docs/auth.py +143 -0
  48. docspan/backends/google_docs/backend.py +140 -0
  49. docspan/backends/google_docs/client.py +665 -0
  50. docspan/backends/google_docs/converter.py +471 -0
  51. docspan/backends/google_docs/docs_request_builder.py +232 -0
  52. docspan/backends/google_docs/docs_structure_parser.py +120 -0
  53. docspan/backends/google_docs/markdown_to_paragraph_parser.py +145 -0
  54. docspan/cli/__init__.py +0 -0
  55. docspan/cli/main.py +408 -0
  56. docspan/config.py +62 -0
  57. docspan/core/__init__.py +49 -0
  58. docspan/core/merge.py +30 -0
  59. docspan/core/orchestrator.py +332 -0
  60. docspan/core/paths.py +8 -0
  61. docspan/core/state.py +53 -0
  62. docspan-0.1.0.dist-info/METADATA +273 -0
  63. docspan-0.1.0.dist-info/RECORD +65 -0
  64. docspan-0.1.0.dist-info/WHEEL +4 -0
  65. docspan-0.1.0.dist-info/entry_points.txt +2 -0
@@ -0,0 +1,400 @@
1
+ """
2
+ ADF (Atlassian Document Format) parser and analyzer.
3
+
4
+ This module provides functionality to parse, validate, and analyze ADF documents
5
+ extracted from Confluence pages.
6
+ """
7
+
8
+ import json
9
+ from dataclasses import dataclass, field
10
+ from enum import Enum
11
+ from pathlib import Path
12
+ from typing import Any, Dict, List, Optional, Set
13
+
14
+
15
class AdfNodeType(Enum):
    """Node kinds this module recognises in an ADF document.

    Every member's value is the literal ``type`` string found in ADF JSON.
    ``UNKNOWN`` is the fallback member for any string without a match.
    """

    # Root and basic text blocks
    DOC = "doc"
    PARAGRAPH = "paragraph"
    TEXT = "text"
    HEADING = "heading"

    # Lists, code and quoting
    BULLET_LIST = "bulletList"
    ORDERED_LIST = "orderedList"
    LIST_ITEM = "listItem"
    CODE_BLOCK = "codeBlock"
    BLOCK_QUOTE = "blockquote"
    PANEL = "panel"

    # Tables
    TABLE = "table"
    TABLE_ROW = "tableRow"
    TABLE_CELL = "tableCell"
    TABLE_HEADER = "tableHeader"

    # Media and inline elements
    MEDIA_SINGLE = "mediaSingle"
    MEDIA = "media"
    MENTION = "mention"
    EMOJI = "emoji"
    HARD_BREAK = "hardBreak"
    RULE = "rule"
    INLINE_CARD = "inlineCard"
    BLOCK_CARD = "blockCard"

    # Expandable, decision and task structures
    EXPAND = "expand"
    DECISION_LIST = "decisionList"
    DECISION_ITEM = "decisionItem"
    TASK_LIST = "taskList"
    TASK_ITEM = "taskItem"

    # Extensions
    EXTENSION = "extension"
    BODIED_EXTENSION = "bodiedExtension"

    # Fallback for anything not listed above
    UNKNOWN = "unknown"

    @classmethod
    def from_string(cls, value: str) -> "AdfNodeType":
        """Return the member whose value is *value*, or ``UNKNOWN`` if none matches."""
        try:
            member = cls(value)
        except ValueError:
            # Enum lookup rejects values that belong to no member.
            member = cls.UNKNOWN
        return member
56
+
57
+
58
class MarkType(Enum):
    """Mark kinds this module recognises on ADF nodes.

    Every member's value is the literal mark ``type`` string found in ADF
    JSON. ``UNKNOWN`` is the fallback member for any string without a match.
    """

    # Basic text formatting
    STRONG = "strong"
    EM = "em"
    CODE = "code"
    STRIKE = "strike"
    UNDERLINE = "underline"
    SUBSUP = "subsup"

    # Links and colouring
    LINK = "link"
    TEXT_COLOR = "textColor"
    BACKGROUND_COLOR = "backgroundColor"

    # Layout
    ALIGNMENT = "alignment"
    INDENTATION = "indentation"

    # Fallback for anything not listed above
    UNKNOWN = "unknown"

    @classmethod
    def from_string(cls, value: str) -> "MarkType":
        """Return the member whose value is *value*, or ``UNKNOWN`` if none matches."""
        try:
            member = cls(value)
        except ValueError:
            # Enum lookup rejects values that belong to no member.
            member = cls.UNKNOWN
        return member
81
+
82
+
83
@dataclass
class AdfMark:
    """A mark (text formatting annotation) attached to an ADF node."""

    # Kind of mark.
    type: MarkType
    # Attributes of the mark; every instance gets its own empty dict by default.
    attrs: Dict[str, Any] = field(default_factory=dict)
89
+
90
+
91
@dataclass
class AdfNode:
    """One node of a parsed ADF document tree."""

    # Kind of node.
    type: AdfNodeType
    # Node attributes; every instance gets its own empty dict by default.
    attrs: Dict[str, Any] = field(default_factory=dict)
    # Child nodes.
    content: List["AdfNode"] = field(default_factory=list)
    # Marks attached to this node.
    marks: List[AdfMark] = field(default_factory=list)
    # Literal text carried by the node, if any.
    text: Optional[str] = None

    def get_text_content(self) -> str:
        """Return the text of this node, or of its subtree when it has none.

        A node with non-empty ``text`` returns just that text. Otherwise the
        text of all children is concatenated in order, without separators.
        """
        if self.text:
            return self.text
        return "".join(child.get_text_content() for child in self.content)

    def find_nodes_by_type(self, node_type: AdfNodeType) -> List["AdfNode"]:
        """Return every node of *node_type* in this subtree, in pre-order.

        The node itself is included when it matches.
        """
        matches: List["AdfNode"] = [self] if self.type == node_type else []
        for child in self.content:
            matches.extend(child.find_nodes_by_type(node_type))
        return matches

    def count_nodes_by_type(self) -> Dict[AdfNodeType, int]:
        """Return how many nodes of each type this subtree contains (self included)."""
        counts: Dict[AdfNodeType, int] = {}

        def tally(node: "AdfNode") -> None:
            counts[node.type] = counts.get(node.type, 0) + 1
            for child in node.content:
                tally(child)

        tally(self)
        return counts

    def to_dict(self) -> Dict[str, Any]:
        """Convert the node and its subtree to a plain dictionary.

        ``attrs``, ``content`` and ``marks`` are emitted only when non-empty;
        ``text`` is emitted whenever it is not ``None``.
        """
        result: Dict[str, Any] = {"type": self.type.value}

        if self.attrs:
            result["attrs"] = self.attrs

        if self.content:
            result["content"] = [child.to_dict() for child in self.content]

        if self.marks:
            result["marks"] = [self._mark_to_dict(mark) for mark in self.marks]

        if self.text is not None:
            result["text"] = self.text

        return result

    @staticmethod
    def _mark_to_dict(mark: AdfMark) -> Dict[str, Any]:
        """Serialise one mark, leaving out ``attrs`` when the mark has none.

        This mirrors how node attrs are handled, so an attr-less mark such as
        ``{"type": "strong"}`` serialises back to exactly that instead of
        gaining an empty ``"attrs": {}`` entry.
        """
        serialized: Dict[str, Any] = {"type": mark.type.value}
        if mark.attrs:
            serialized["attrs"] = mark.attrs
        return serialized
155
+
156
+
157
@dataclass
class AdfDocument:
    """A parsed ADF document: a version number plus the root of the node tree."""

    # Value of the document's ``version`` field.
    version: int
    # Root node holding the top-level content as its children.
    root: AdfNode

    def get_all_text(self) -> str:
        """Return the text content of the whole tree, as produced by the root node."""
        return self.root.get_text_content()

    def get_node_statistics(self) -> Dict[str, int]:
        """Return node counts keyed by the node type's string value."""
        statistics: Dict[str, int] = {}
        for kind, total in self.root.count_nodes_by_type().items():
            statistics[kind.value] = total
        return statistics

    def find_links(self) -> List[Dict[str, Any]]:
        """Return one entry per link mark found anywhere in the tree.

        Each entry holds the mark's ``href`` and ``title`` attributes (``None``
        when absent) and the text content of the node carrying the mark.
        Entries appear in pre-order of the nodes that carry them.
        """
        found: List[Dict[str, Any]] = []
        pending = [self.root]

        while pending:
            current = pending.pop()
            for mark in current.marks:
                if mark.type == MarkType.LINK:
                    entry = {
                        "href": mark.attrs.get("href"),
                        "text": current.get_text_content(),
                        "title": mark.attrs.get("title"),
                    }
                    found.append(entry)
            # Reversed push so the first child is popped first (pre-order).
            pending.extend(reversed(current.content))

        return found

    def find_mentions(self) -> List[Dict[str, Any]]:
        """Return the id, text and access level of every mention node."""
        mentions: List[Dict[str, Any]] = []
        for node in self.root.find_nodes_by_type(AdfNodeType.MENTION):
            attrs = node.attrs
            mentions.append(
                {
                    "id": attrs.get("id"),
                    "text": attrs.get("text"),
                    "access_level": attrs.get("accessLevel"),
                }
            )
        return mentions

    def find_media(self) -> List[Dict[str, Any]]:
        """Return the identifying and sizing attributes of every media node."""
        media: List[Dict[str, Any]] = []
        for node in self.root.find_nodes_by_type(AdfNodeType.MEDIA):
            attrs = node.attrs
            media.append(
                {
                    "id": attrs.get("id"),
                    "type": attrs.get("type"),
                    "collection": attrs.get("collection"),
                    "alt": attrs.get("alt"),
                    "width": attrs.get("width"),
                    "height": attrs.get("height"),
                }
            )
        return media

    def to_dict(self) -> Dict[str, Any]:
        """Convert the document to a plain dictionary.

        The result carries the version, the fixed type ``"doc"`` and the
        serialised children of the root node.
        """
        children = []
        for child in self.root.content:
            children.append(child.to_dict())

        return {
            "version": self.version,
            "type": "doc",
            "content": children,
        }
228
+
229
+
230
class AdfParser:
    """
    Parser for ADF (Atlassian Document Format) documents.

    Turns ADF JSON (already decoded into dictionaries) into a tree of
    ``AdfNode`` objects wrapped in an ``AdfDocument``.
    """

    # attrs key under which the source ``type`` string of an unrecognised node
    # or mark is kept. AdfAnalyzer looks this key up when listing unknown types.
    _ORIGINAL_TYPE_KEY = "__original_type"

    def parse(self, adf_data: Dict[str, Any]) -> AdfDocument:
        """
        Parse ADF JSON data into an AdfDocument.

        Args:
            adf_data: ADF document as a dictionary

        Returns:
            Parsed ADF document

        Raises:
            ValueError: If the ADF data is invalid
        """
        if not isinstance(adf_data, dict):
            raise ValueError("ADF data must be a dictionary")

        version = adf_data.get("version", 1)
        doc_type = adf_data.get("type", "doc")

        if doc_type != "doc":
            raise ValueError(f"Expected document type 'doc', got '{doc_type}'")

        root = AdfNode(
            type=AdfNodeType.DOC,
            content=[
                self._parse_node(node)
                for node in self._read_list(adf_data, "content")
            ],
        )

        return AdfDocument(version=version, root=root)

    def parse_file(self, file_path: Path) -> AdfDocument:
        """
        Parse an ADF JSON file.

        Args:
            file_path: Path to the ADF JSON file (read as UTF-8)

        Returns:
            Parsed ADF document

        Raises:
            ValueError: If the decoded ADF data is invalid
        """
        with open(file_path, "r", encoding="utf-8") as f:
            adf_data = json.load(f)

        return self.parse(adf_data)

    def _parse_node(self, node_data: Dict[str, Any]) -> AdfNode:
        """Parse a single ADF node and, recursively, its children.

        Raises:
            ValueError: If the node, one of its marks, its attrs or one of its
                list fields has the wrong JSON type
        """
        if not isinstance(node_data, dict):
            raise ValueError(f"Node data must be a dictionary, got {type(node_data)}")

        raw_type = node_data.get("type", "unknown")
        node_type = AdfNodeType.from_string(raw_type)
        attrs = self._read_attrs(node_data, "Node")
        if node_type is AdfNodeType.UNKNOWN:
            attrs = self._remember_original_type(
                attrs, raw_type, AdfNodeType.UNKNOWN.value
            )

        marks = [
            self._parse_mark(mark_data)
            for mark_data in self._read_list(node_data, "marks")
        ]
        content = [
            self._parse_node(child_data)
            for child_data in self._read_list(node_data, "content")
        ]

        return AdfNode(
            type=node_type,
            attrs=attrs,
            content=content,
            marks=marks,
            text=node_data.get("text"),
        )

    def _parse_mark(self, mark_data: Dict[str, Any]) -> AdfMark:
        """Parse a single mark dictionary into an AdfMark."""
        if not isinstance(mark_data, dict):
            # Same exception type as for malformed nodes, instead of the
            # AttributeError a bare ``.get`` on a non-dict would raise.
            raise ValueError(f"Mark data must be a dictionary, got {type(mark_data)}")

        raw_type = mark_data.get("type", "unknown")
        mark_type = MarkType.from_string(raw_type)
        attrs = self._read_attrs(mark_data, "Mark")
        if mark_type is MarkType.UNKNOWN:
            attrs = self._remember_original_type(
                attrs, raw_type, MarkType.UNKNOWN.value
            )

        return AdfMark(type=mark_type, attrs=attrs)

    @staticmethod
    def _read_list(container: Dict[str, Any], key: str) -> List[Any]:
        """Return ``container[key]`` as a sequence, treating missing/``null`` as empty."""
        value = container.get(key)
        if value is None:
            return []
        if not isinstance(value, (list, tuple)):
            raise ValueError(f"'{key}' must be a list, got {type(value)}")
        return value

    @staticmethod
    def _read_attrs(container: Dict[str, Any], owner: str) -> Dict[str, Any]:
        """Return ``container['attrs']``, treating missing/``null`` as an empty dict."""
        value = container.get("attrs")
        if value is None:
            return {}
        if not isinstance(value, dict):
            raise ValueError(f"{owner} attrs must be a dictionary, got {type(value)}")
        return value

    @staticmethod
    def _remember_original_type(
        attrs: Dict[str, Any], raw_type: Any, unknown_value: str
    ) -> Dict[str, Any]:
        """Return attrs extended with the unrecognised source type string.

        The input dict is copied rather than mutated so the caller's JSON data
        stays untouched. Nothing is added when the source had no type, when it
        literally said ``unknown_value``, or when the key is already present.
        Because the key lives in ``attrs``, it appears wherever attrs are
        serialised.
        """
        key = AdfParser._ORIGINAL_TYPE_KEY
        if raw_type is None or raw_type == unknown_value or key in attrs:
            return attrs
        return {**attrs, key: raw_type}
311
+
312
+
313
@dataclass
class AdfAnalysisReport:
    """Report from analyzing an ADF document."""

    # Total number of nodes counted.
    node_count: int
    # Node counts keyed by node type string.
    node_statistics: Dict[str, int]
    # Length of the document's concatenated text.
    total_text_length: int
    # Number of entries in ``links``.
    link_count: int
    links: List[Dict[str, Any]]
    # Number of entries in ``mentions``.
    mention_count: int
    mentions: List[Dict[str, Any]]
    # Number of entries in ``media``.
    media_count: int
    media: List[Dict[str, Any]]
    # Type strings recorded for unrecognised nodes and marks.
    unknown_node_types: Set[str] = field(default_factory=set)
    unknown_mark_types: Set[str] = field(default_factory=set)

    def to_dict(self) -> Dict[str, Any]:
        """Convert report to dictionary.

        The two unknown-type sets are emitted as sorted lists. Set iteration
        order for strings varies between interpreter runs, so sorting keeps
        the serialised report stable and diffable.
        """
        return {
            "node_count": self.node_count,
            "node_statistics": self.node_statistics,
            "total_text_length": self.total_text_length,
            "link_count": self.link_count,
            "links": self.links,
            "mention_count": self.mention_count,
            "mentions": self.mentions,
            "media_count": self.media_count,
            "media": self.media,
            # key=str keeps sorting total even if a non-string slipped in.
            "unknown_node_types": sorted(self.unknown_node_types, key=str),
            "unknown_mark_types": sorted(self.unknown_mark_types, key=str),
        }
344
+
345
+
346
class AdfAnalyzer:
    """
    Analyzer for ADF documents.

    Collects counts, links, mentions, media and unrecognised types from a
    parsed document into an ``AdfAnalysisReport``.
    """

    def analyze(self, document: AdfDocument) -> AdfAnalysisReport:
        """
        Analyze an ADF document and generate a report.

        Args:
            document: ADF document to analyze

        Returns:
            Analysis report
        """
        statistics = document.get_node_statistics()
        total_nodes = sum(statistics.values())
        all_text = document.get_all_text()
        link_entries = document.find_links()
        mention_entries = document.find_mentions()
        media_entries = document.find_media()

        # Names recorded for nodes/marks whose type resolved to UNKNOWN. The
        # name is read from the "__original_type" attr and falls back to
        # "unknown" when that attr is absent.
        unknown_node_names = set()
        unknown_mark_names = set()

        # Explicit stack instead of recursion; both results are sets, so the
        # visiting order does not affect them.
        pending = [document.root]
        while pending:
            node = pending.pop()

            if node.type == AdfNodeType.UNKNOWN:
                unknown_node_names.add(
                    str(node.attrs.get("__original_type", "unknown"))
                )

            for mark in node.marks:
                if mark.type == MarkType.UNKNOWN:
                    unknown_mark_names.add(
                        str(mark.attrs.get("__original_type", "unknown"))
                    )

            pending.extend(node.content)

        return AdfAnalysisReport(
            node_count=total_nodes,
            node_statistics=statistics,
            total_text_length=len(all_text),
            link_count=len(link_entries),
            links=link_entries,
            mention_count=len(mention_entries),
            mentions=mention_entries,
            media_count=len(media_entries),
            media=media_entries,
            unknown_node_types=unknown_node_names,
            unknown_mark_types=unknown_mark_names,
        )
@@ -0,0 +1,161 @@
1
+ """
2
+ Validators for ADF content.
3
+ """
4
+
5
+ from typing import Any, Dict, List
6
+
7
+
8
class AdfValidator:
    """
    Validator for Atlassian Document Format (ADF) content.

    All methods are static and return a list of human-readable error
    messages; an empty list means no problem was found.
    """

    # Node types whose ``content`` list is validated recursively.
    _CONTAINER_TYPES = (
        "paragraph",
        "heading",
        "bulletList",
        "orderedList",
        "listItem",
        "blockquote",
        "panel",
    )

    @staticmethod
    def validate_document(doc: Dict[str, Any]) -> List[str]:
        """
        Validate an ADF document.

        Args:
            doc: ADF document to validate

        Returns:
            List of validation error messages, empty if valid
        """
        # Report a malformed document instead of failing on ``doc.get``.
        if not isinstance(doc, dict):
            return ["Document must be a dictionary"]

        errors: List[str] = []

        if doc.get("version") != 1:
            errors.append("Invalid document version (must be 1)")

        if doc.get("type") != "doc":
            errors.append("Invalid document type (must be 'doc')")

        content = doc.get("content", [])
        if not isinstance(content, list):
            errors.append("Document content must be a list")
        else:
            for i, node in enumerate(content):
                errors.extend(
                    f"Content node {i}: {error}"
                    for error in AdfValidator.validate_node(node)
                )

        return errors

    @staticmethod
    def validate_node(node: Dict[str, Any]) -> List[str]:
        """
        Validate an ADF node.

        Args:
            node: ADF node to validate

        Returns:
            List of validation error messages, empty if valid
        """
        if not isinstance(node, dict):
            return ["Node must be a dictionary"]

        errors: List[str] = []

        node_type = node.get("type")
        if not node_type:
            errors.append("Node must have a type")

        if node_type == "text":
            if "text" not in node:
                errors.append("Text node must have 'text' content")

            if not isinstance(node.get("marks", []), list):
                errors.append("Text marks must be a list")

        elif node_type in AdfValidator._CONTAINER_TYPES:
            content = node.get("content", [])
            if not isinstance(content, list):
                errors.append(f"{node_type} content must be a list")
            else:
                for i, child in enumerate(content):
                    errors.extend(
                        f"Child node {i}: {error}"
                        for error in AdfValidator.validate_node(child)
                    )

            if node_type == "heading" and not AdfValidator._heading_levels_valid(node):
                errors.append("Heading level must be an integer between 1 and 6")

        elif node_type == "codeBlock":
            if not isinstance(node.get("content", []), list):
                errors.append("CodeBlock content must be a list")

        elif node_type == "media":
            errors.extend(AdfValidator._validate_media_attrs(node))

        # Other node types (e.g. "rule") need no additional validation.
        return errors

    @staticmethod
    def _heading_levels_valid(node: Dict[str, Any]) -> bool:
        """Return True when every heading level present on *node* is 1..6.

        ADF stores the level in ``attrs.level``. A top-level ``level`` key is
        still checked as well, because that is where this validator used to
        look. A heading without any level is not reported.
        """
        levels = []
        if "level" in node:
            levels.append(node["level"])
        attrs = node.get("attrs")
        if isinstance(attrs, dict) and "level" in attrs:
            levels.append(attrs["level"])

        return all(
            # bool is a subclass of int, so exclude it explicitly.
            isinstance(level, int)
            and not isinstance(level, bool)
            and 1 <= level <= 6
            for level in levels
        )

    @staticmethod
    def _validate_media_attrs(node: Dict[str, Any]) -> List[str]:
        """Check the ``attrs`` of a media node; missing/``null`` attrs count as empty."""
        attrs = node.get("attrs")
        if attrs is None:
            attrs = {}
        if not isinstance(attrs, dict):
            return ["Media attrs must be a dictionary"]

        errors: List[str] = []
        media_type = attrs.get("type")

        if not media_type:
            errors.append("Media node must have a type attribute")

        if media_type == "external" and not attrs.get("url"):
            errors.append("External media must have a URL")

        if media_type == "file" and not attrs.get("id"):
            errors.append("File media must have an ID")

        return errors

    @staticmethod
    def validate_nested_content(node: Dict[str, Any], allowed_types: List[str]) -> List[str]:
        """
        Validate nested content of a node.

        Every child must be a dictionary with a type from *allowed_types*;
        each dictionary child is additionally run through ``validate_node``.

        Args:
            node: ADF node to validate
            allowed_types: List of allowed child node types

        Returns:
            List of validation error messages, empty if valid
        """
        if not isinstance(node, dict):
            return ["Node must be a dictionary"]

        content = node.get("content", [])
        if not isinstance(content, list):
            return [f"{node.get('type')} content must be a list"]

        errors: List[str] = []

        for i, child in enumerate(content):
            if not isinstance(child, dict):
                errors.append(f"Child node {i} must be a dictionary")
                continue

            child_type = child.get("type")
            if not child_type:
                errors.append(f"Child node {i} must have a type")
            elif child_type not in allowed_types:
                errors.append(
                    f"Child node {i} has invalid type: {child_type} (allowed: {', '.join(allowed_types)})"
                )

            errors.extend(
                f"Child node {i}: {error}"
                for error in AdfValidator.validate_node(child)
            )

        return errors