mkdocs-confluence-plugin 1.27.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1677 @@
1
+ import time
2
+ import os
3
+ import hashlib
4
+ import sys
5
+ import re
6
+ import requests
7
+ import mimetypes
8
+ import mistune
9
+ import contextlib
10
+ import logging
11
+ from urllib.parse import quote
12
+ from pathlib import Path
13
+ import string
14
+ import mkdocs
15
+ from mkdocs.config import config_options
16
+ from mkdocs.plugins import BasePlugin
17
+ from mkdocs.structure.nav import Navigation
18
+ from mkdocs.structure.pages import Page
19
+ from md2cf.confluence_renderer import ConfluenceRenderer
20
+ from atlassian import Confluence
21
+ from urllib.parse import quote_plus
22
+ from typing import Optional
23
+ from difflib import get_close_matches
24
+
25
# Storage-format placeholder body used when the plugin has to create a page
# that has no content of its own (e.g. intermediate parent pages).
TEMPLATE_BODY = "<p> TEMPLATE </p>"
# Notice text for generated pages.
MKDOCS_FOOTER = (
    "This page is auto-generated and will be overwritten at the next run."
)

# Console handler with the plugin's own "mk2conflu" prefix.
formatter = logging.Formatter("mk2conflu [%(levelname)8s] : %(message)s")
stream_handler = logging.StreamHandler()
stream_handler.setFormatter(formatter)

# Module-wide logger; INFO by default.
log = logging.getLogger(__name__)
log.setLevel(logging.INFO)
log.addHandler(stream_handler)
34
+
35
+
36
@contextlib.contextmanager
def nostdout():
    """Silence ``sys.stdout`` for the duration of a ``with`` block.

    ``sys.stdout`` is swapped for a :class:`DummyFile` on entry and the
    original stream is put back on exit, including when the body raises.
    """
    save_stdout = sys.stdout
    sys.stdout = DummyFile()
    try:
        yield
    finally:
        # Restore even on error; otherwise one failing call would leave the
        # whole process permanently muted.
        sys.stdout = save_stdout


class DummyFile:
    """Minimal write-only sink that discards everything sent to it."""

    def write(self, x):
        """Accept and drop ``x``."""
        pass

    def flush(self):
        """No-op, so callers that flush stdout (``print(..., flush=True)``) work."""
        pass
47
+
48
+
49
+ class ConfluencePlugin(BasePlugin):
50
# Options accepted under this plugin's entry in mkdocs.yml.
# NOTE: the username/password defaults are read from the environment once,
# when this class body is executed (i.e. at import time).
config_scheme = (
    ("host_url", config_options.Type(str, default=None)),
    ("github_base_url", config_options.Type(str, default=None)),
    ("space", config_options.Type(str, default=None)),
    ("parent_page_name", config_options.Type(str, default=None)),
    (
        "username",
        config_options.Type(str, default=os.environ.get("CONFLUENCE_USERNAME")),
    ),
    (
        "password",
        config_options.Type(str, default=os.environ.get("CONFLUENCE_PASSWORD")),
    ),
    # `enabled` and `only_in_nav` are read by on_config; declaring them here
    # makes them legitimate options. Defaults match the fallbacks used there.
    ("enabled", config_options.Type(bool, default=True)),
    ("only_in_nav", config_options.Type(bool, default=False)),
    ("enabled_if_env", config_options.Type(str, default=None)),
    ("verbose", config_options.Type(bool, default=False)),
    ("debug", config_options.Type(bool, default=False)),
    ("dryrun", config_options.Type(bool, default=False)),
    ("enable_footer", config_options.Type(bool, default=False)),
    ("default_labels", config_options.Type(list, default=["cpe", "mkdocs"])),
)
70
+
71
def __init__(self):
    """Set up renderer, HTTP session and the in-memory caches used by the hooks."""
    super().__init__()
    # Normalised title / path key -> page info dict (filled in on_page_markdown).
    self.page_lookup = {}
    self.enabled = True
    self.logger = log
    # Markdown -> Confluence storage format (XHTML) conversion.
    self.confluence_renderer = ConfluenceRenderer(use_xhtml=True)
    self.confluence_mistune = mistune.Markdown(renderer=self.confluence_renderer)
    self.session = requests.Session()
    self.pages = []
    # Caches of Confluence page ids / versions.
    self.page_ids = {}
    self.page_versions = {}
    self.dryrun = False
    # Nested nav structure built in on_nav.
    self.tab_nav = []
    self.attachments = {}
    self.auth_configured = False
    # Store attachments for deferred processing after all plugins have run
    self.deferred_attachments = []
    # Defined up front because on_pre_build / on_post_build read it even when
    # no parent_page_name is configured (on_config only assigns it then).
    self.parent_page_id = None
    # Child title -> parent title; replaced in on_nav, read by _build_page_path.
    self.page_parents = {}
87
+
88
def normalize_title_key(self, title: str) -> str:
    """Return a lowercase slug of ``title``.

    Every run of characters other than ``a-z``/``0-9`` becomes a single
    ``-``; leading and trailing dashes are removed.
    """
    lowered = title.lower()
    slug = re.sub(r"[^a-z0-9]+", "-", lowered)
    return slug.strip("-")
90
+
91
def extract_meaningful_words(self, text: str) -> set:
    """Return the set of meaningful words found in ``text``.

    The text is lowercased, a leading ``kb-``/``rb-``, ``doc(s)-`` and
    four-digit prefix are removed (in that order), and the remainder is split
    on dashes, underscores, whitespace, dots and slashes. Known abbreviations
    are replaced by their expansions; other words are kept when they are
    longer than two characters, not purely numeric and not a stop word.
    """
    # Prefix patterns are applied sequentially, each to the previous result.
    cleaned = text.lower()
    for prefix_pattern in (r"^(kb|rb)-", r"^docs?-", r"^\d{4}-?"):
        cleaned = re.sub(prefix_pattern, "", cleaned)

    expansions = {
        # "adrs"/"adr" deliberately cover both "design" and "decision".
        "adrs": ["architecture", "design", "records", "decision"],
        "adr": ["architecture", "design", "record", "decision"],
        "arch": ["architecture"],
        "sso": ["single", "sign", "on"],
        "auth": ["authentication", "authorization", "auth"],
        "kb": ["knowledge", "base"],
        "rb": ["runbook"],
        "ci": ["continuous", "integration"],
        "cd": ["continuous", "delivery"],
        "cicd": ["continuous", "integration", "deployment", "delivery"],
        "ci/cd": ["continuous", "integration", "deployment", "delivery"],
        "aws": ["amazon", "web", "services"],
        "api": ["application", "programming", "interface"],
        "apis": ["application", "programming", "interface", "endpoints"],
        "rest": ["representational", "state", "transfer"],
        "ui": ["user", "interface"],
        "db": ["database"],
        "config": ["configuration"],
        "admin": ["administration", "administrator"],
        "mgmt": ["management"],
        "ops": ["operations"],
        "dev": ["development"],
        "prod": ["production"],
        "env": ["environment"],
        "tech": ["technology"],
        "deploy": ["deployment"],
        "troubleshoot": ["troubleshooting"],
        "setup": ["setup", "configuration"],
        "guide": ["guide", "guidelines"],
    }
    stop_words = {
        "the",
        "and",
        "for",
        "with",
        "are",
        "not",
        "how",
        "can",
        "you",
        "but",
        "was",
    }

    found = set()

    # The whole (cleaned) text may itself be an abbreviation, e.g. "ci/cd",
    # which the separator split below would otherwise break apart.
    whole = cleaned.lower()
    if whole in expansions:
        found.update(expansions[whole])

    for token in re.split(r"[-_\s\./]+", cleaned):
        token = token.strip().lower()
        if token in expansions:
            # Abbreviations expand regardless of their length.
            found.update(expansions[token])
            continue
        if len(token) <= 2 or token.isdigit():
            continue
        if token not in stop_words:
            found.add(token)

    return found
175
+
176
def calculate_word_similarity(self, text1: str, text2: str) -> float:
    """Return the Jaccard overlap (0.0-1.0) of the meaningful words of two texts.

    Yields 0.0 when either text has no meaningful words.
    """
    first = self.extract_meaningful_words(text1)
    second = self.extract_meaningful_words(text2)

    if not (first and second):
        return 0.0

    shared = first & second
    combined = first | second
    # `combined` cannot be empty here: both inputs are non-empty.
    return len(shared) / len(combined)
188
+
189
def on_config(self, config):
    """Read plugin options, decide whether export is enabled and connect to Confluence.

    Order matters: the on/off switches (``enabled`` and ``enabled_if_env``)
    are evaluated *before* credentials are required, so a build with the
    export switched off does not fail on missing credentials.

    Returns:
        The MkDocs ``config`` object, unchanged.

    Raises:
        ValueError: if export is enabled and a required option is missing,
            or an intermediate parent page cannot be created.
    """
    plugin_cfg = self.config
    self.space = plugin_cfg.get("space")
    self.enabled = plugin_cfg.get("enabled", True)
    self.only_in_nav = plugin_cfg.get("only_in_nav", False)
    self.default_labels = plugin_cfg.get("default_labels", ["cpe", "mkdocs"])
    self.dryrun = plugin_cfg.get("dryrun", False)
    # Always defined so later hooks can read it even without parent_page_name.
    self.parent_page_id = None

    if not self.enabled:
        return config

    if plugin_cfg.get("debug", False):
        log.setLevel(logging.DEBUG)

    enabled_if_env = plugin_cfg.get("enabled_if_env")
    if enabled_if_env:
        self.enabled = os.environ.get(enabled_if_env) == "1"
        if not self.enabled:
            log.warning(
                f"Exporting MKDOCS pages to Confluence turned OFF: set env var {enabled_if_env}=1 to enable."
            )
            return config
        log.info(
            f"Exporting MKDOCS pages to Confluence turned ON (env var {enabled_if_env}=1)."
        )
    else:
        log.info("Exporting MKDOCS pages to Confluence turned ON by default!")

    # Late fallback to the environment for credentials.
    for cred_key, env_name in (
        ("username", "CONFLUENCE_USERNAME"),
        ("password", "CONFLUENCE_PASSWORD"),
    ):
        if not plugin_cfg.get(cred_key):
            plugin_cfg[cred_key] = os.environ.get(env_name)

    required_keys = ["host_url", "username", "password", "space"]
    missing_keys = [k for k in required_keys if not plugin_cfg.get(k)]
    if missing_keys:
        raise ValueError(f"Missing required config keys: {', '.join(missing_keys)}")

    self.confluence = Confluence(
        # Tolerate a host_url that points at the REST content endpoint.
        url=plugin_cfg["host_url"].replace("/rest/api/content", ""),
        username=plugin_cfg["username"],
        password=plugin_cfg["password"],
    )

    # Configure session for attachment uploads
    self.session.auth = (plugin_cfg["username"], plugin_cfg["password"])
    self.auth_configured = True

    if self.dryrun:
        log.warning("DRYRUN MODE ENABLED: No changes will be made to Confluence.")

    if plugin_cfg.get("parent_page_name"):
        self.parent_page_id = self._ensure_parent_page_chain(
            plugin_cfg["parent_page_name"]
        )
        log.info(
            f"Using final root parent page ID {self.parent_page_id} for path '{plugin_cfg['parent_page_name']}'"
        )

    return config

def _ensure_parent_page_chain(self, parent_path):
    """Resolve a ``/``-separated page path, creating missing pages; return the last page id."""
    current_parent_id = None

    for part in parent_path.split("/"):
        page_id = self.find_page_id(part, parent_id=current_parent_id)
        if not page_id:
            if self.dryrun:
                log.warning(
                    f"DRYRUN: Would create missing intermediate page: {part}"
                )
                # Placeholder id so the rest of the path can still be walked.
                page_id = f"DUMMY_ID_{part}"
            else:
                log.warning(
                    f"Intermediate parent page '{part}' not found. Creating it..."
                )
                result = self.confluence.create_page(
                    space=self.config["space"],
                    title=part,
                    body=TEMPLATE_BODY,
                    parent_id=current_parent_id,
                    representation="storage",
                )
                if not (result and "id" in result):
                    raise ValueError(
                        f"Failed to create intermediate parent page: {part}"
                    )
                page_id = result["id"]
                self.page_ids[(part, current_parent_id)] = page_id
                self.page_versions[(part, current_parent_id)] = 1
                log.info(
                    f"Created intermediate parent page '{part}' with ID {page_id}"
                )

        current_parent_id = page_id

    return current_parent_id
285
+
286
def on_pre_build(self, config, **kwargs):
    """Create the Confluence folder pages for the current nav tree, if enabled."""
    if not self.enabled:
        return
    log.info("🛠️ Pre-building Confluence folder hierarchy before content processing")
    # parent_page_id is only assigned by on_config when parent_page_name is
    # configured; fall back to None instead of raising AttributeError.
    # NOTE(review): tab_nav is populated in on_nav; if MkDocs fires this hook
    # before on_nav the tree is still empty here - confirm the event order.
    self.create_folder_structure_only(
        self.tab_nav, parent_id=getattr(self, "parent_page_id", None)
    )
291
+
292
+ def _normalize_parent_id(self, parent_id):
293
+ return str(parent_id) if parent_id else None
294
+
295
+ def _collect_all_page_names(self, nav_list):
296
+ result = []
297
+ # Handle the case where nav_list is a dict (for recursive calls)
298
+ if isinstance(nav_list, dict):
299
+ nav_list = [nav_list]
300
+
301
+ for item in nav_list:
302
+ if isinstance(item, dict):
303
+ for key, value in item.items():
304
+ result.append(key)
305
+ result.extend(self._collect_all_page_names(value))
306
+ else:
307
+ result.append(item)
308
+ return result
309
+
310
def create_folder_structure_only(self, nav_tree, parent_id=None):
    """Ensure a Confluence page exists for every folder (dict node) in ``nav_tree``.

    String nodes are leaves and are ignored. For each folder the page id is
    taken from the cache, then looked up in Confluence, and only then
    created. Children are processed recursively under the folder's page.
    When a folder cannot be created its subtree is skipped, so children are
    never attached to the wrong parent.
    """
    for node in nav_tree:
        if not isinstance(node, dict):
            # Leaf node, nothing to do here
            continue

        for folder_title, children in node.items():
            norm_title = folder_title.strip()
            norm_key = (
                self._normalize_title(norm_title),
                str(parent_id) if parent_id else None,
            )

            if norm_key in self.page_ids:
                folder_page_id = self.page_ids[norm_key]
                log.debug(
                    f"Folder page '{norm_title}' already cached with ID {folder_page_id}"
                )
            else:
                folder_page_id = self.find_page_id_or_global(
                    norm_title, parent_id=parent_id
                )

                if folder_page_id:
                    self.page_ids[norm_key] = folder_page_id
                    self.page_versions[norm_key] = 1
                    log.debug(
                        f"Found existing folder page '{norm_title}' with ID {folder_page_id}"
                    )
                elif self.dryrun:
                    # No id is available in dry-run mode; children are still
                    # walked so that their would-be creation gets logged.
                    log.info(
                        f"DRYRUN: Would create folder page '{norm_title}' under parent ID {parent_id}"
                    )
                else:
                    log.info(
                        f"Creating folder page '{norm_title}' under parent ID {parent_id}"
                    )
                    try:
                        result = self.confluence.create_page(
                            space=self.config["space"],
                            title=norm_title,
                            body="",  # No body for folder pages
                            parent_id=parent_id,
                            representation="storage",
                        )
                    except Exception as e:
                        log.error(
                            f"❌ Failed to create folder page '{norm_title}': {e}"
                        )
                        continue

                    if not (result and "id" in result):
                        log.warning(
                            f"Failed to create folder page '{norm_title}': No ID returned"
                        )
                        log.error(
                            f"❌ Failed to create folder page '{norm_title}'"
                        )
                        continue

                    folder_page_id = result["id"]
                    self.page_ids[norm_key] = folder_page_id
                    self.page_versions[norm_key] = 1
                    log.info(
                        f"✅ Created folder page '{norm_title}' with ID {folder_page_id}"
                    )

            # Recurse into children
            self.create_folder_structure_only(
                children, parent_id=folder_page_id
            )
384
+
385
def clear_cached_page_info(self):
    """Empty the page-id and page-version caches in place."""
    for cache in (self.page_ids, self.page_versions):
        cache.clear()
388
+
389
def on_nav(self, nav: Navigation, config, files):
    """Build a nested nav structure from the documentation files' source paths.

    Each path segment becomes a title ("_" -> space, title-cased, ".md"
    dropped from the file name). Directories become ``{title: [children]}``
    entries and files plain title strings, sorted by title at every level.
    The result is stored in ``self.tab_nav`` and the child -> parent mapping
    in ``self.page_parents``.
    """

    def label_for(segment):
        return segment.replace("_", " ").title()

    def to_nav(branch):
        # A None value marks a file; a dict marks a directory.
        return [
            name if sub is None else {name: to_nav(sub)}
            for name, sub in sorted(branch.items())
        ]

    tree = {}
    for doc_file in files.documentation_pages():
        segments = doc_file.src_path.split(os.sep)
        if segments[-1].endswith(".md"):
            segments[-1] = segments[-1][:-3]

        # Walk/create the directory levels, then register the file itself.
        branch = tree
        for segment in segments[:-1]:
            branch = branch.setdefault(label_for(segment), {})
        branch.setdefault(label_for(segments[-1]), None)

    nav_structure = to_nav(tree)
    self.tab_nav = nav_structure  # Nested nav structure

    # Build parent-child mapping from nav
    self.page_parents = self._flatten_nav_with_parents(self.tab_nav)

    log.info(f"Auto-generated nested nav: {nav_structure}")
421
+
422
+ def _flatten_nav_with_parents(self, nav, parent=None):
423
+ result = {}
424
+ # Handle the case where nav is a dict (for recursive calls)
425
+ if isinstance(nav, dict):
426
+ nav = [nav]
427
+
428
+ for item in nav:
429
+ if isinstance(item, str):
430
+ result[item] = parent
431
+ elif isinstance(item, dict):
432
+ for k, v in item.items():
433
+ result[k] = parent
434
+ result.update(self._flatten_nav_with_parents(v, parent=k))
435
+ return result
436
+
437
+ def _build_page_path(self, title):
438
+ path = [title]
439
+ parent = self.page_parents.get(title)
440
+ while parent:
441
+ path.insert(0, parent)
442
+ parent = self.page_parents.get(parent)
443
+ return " / ".join(path)
444
+
445
def on_page_markdown(self, markdown, page, config, files):
    """Render the page to Confluence markup and cache it under several keys.

    The same page-info dict is stored in ``self.page_lookup`` under the
    normalized title and, when the source path is known, under the
    normalized path relative to the docs directory and the normalized file
    name. The markdown is returned unchanged so MkDocs proceeds as usual.
    """
    abs_src_path = page.file.abs_src_path
    title_key = self.normalize_title_key(page.title)
    rendered = self.confluence_mistune(markdown)

    page_info = {
        "title": page.title,
        "body": rendered,
        "abs_src_path": abs_src_path,
        "meta": page.meta,
        "url": page.canonical_url,
    }

    # Store under page title key
    self.page_lookup[title_key] = page_info

    # Reverse lookup from normalized title to page info for fuzzy matching
    if not hasattr(self, "title_to_page"):
        self.title_to_page = {}
    self.title_to_page[title_key] = page_info

    path_key = filename_key = None
    if abs_src_path:
        # Relative to the configured docs_dir rather than a literal "docs"
        # under the current working directory, which breaks for projects
        # with a custom docs_dir or builds started from another directory.
        docs_dir = (config or {}).get("docs_dir") or "docs"
        rel_path = os.path.relpath(abs_src_path, docs_dir).replace("\\", "/")
        # Remove .md extension for the path
        if rel_path.endswith(".md"):
            rel_path = rel_path[:-3]
        path_key = self.normalize_title_key(rel_path)
        self.page_lookup[path_key] = page_info

        # Also store under just the filename (without directory)
        filename_key = self.normalize_title_key(os.path.basename(rel_path))
        self.page_lookup[filename_key] = page_info

    self.logger.debug(
        f"📥 Cached page content under key '{title_key}' from '{abs_src_path}'"
    )
    if abs_src_path:
        self.logger.debug(
            f"📥 Also cached under path key '{path_key}' and filename key '{filename_key}'"
        )
    return markdown  # Let MkDocs proceed as usual
489
+
490
def on_page_content(self, html, page, config, files):
    """Append a "View source on GitHub" footer to the page when enabled.

    The footer is also stored on the cached page info (under the page's
    normalized title) so it can be added to the Confluence body later.
    Returns ``html`` unchanged when the footer is disabled or cannot be built.
    """
    log.debug("🧪 on_page_content called")

    if not self.config.get("enable_footer"):
        log.debug("🚫 Footer disabled")
        return html

    github_base_url = self.config.get("github_base_url")
    if not github_base_url:
        log.warning("⚠️ Missing github_base_url - footer cannot be generated")
        return html

    if not hasattr(page.file, "src_uri"):
        log.warning("❌ No src_uri on page.file - footer cannot be generated")
        return html

    # Avoid "//" when the base URL ends with a slash, and percent-encode the
    # path so file names with spaces or other special characters still link.
    source_url = f"{github_base_url.rstrip('/')}/{quote(page.file.src_uri)}"
    footer = f'<p><em><a href="{source_url}">View source on GitHub</a></em></p>'
    log.debug(f"✅ Adding footer: {footer}")

    # Store the footer in page_lookup for later use in Confluence
    title_key = self.normalize_title_key(page.title)
    if title_key in self.page_lookup:
        self.page_lookup[title_key]["footer"] = footer

    return html + footer
516
+
517
def debug_dump_page_parents(self):
    """Print the child -> parent title mapping to stdout (debug aid)."""
    print("🔍 Page parent mapping:")
    for child in self.page_parents:
        print(f" {child} ← {self.page_parents[child]}")
521
+
522
def on_post_build(self, config, **kwargs):
    """Publish the cached pages to Confluence, then sync deferred attachments.

    Does nothing when the plugin is disabled. Pages are published through
    ``build_and_publish_tree``; attachments queued during publishing are
    processed afterwards, once every page exists.
    """
    if not self.enabled:
        log.info("Confluence plugin disabled; skipping post-build.")
        return

    log.info(f"🔁 Nav structure for folder pages creation:\n{self.tab_nav}")
    self.debug_dump_pages()

    # Optional: Dump the page_lookup keys for debugging
    log.debug(f"📄 Keys in page_lookup: {list(self.page_lookup.keys())}")

    # page_lookup stores the same page-info dict under several keys (title,
    # path, file name); de-duplicate by identity so each page counts once.
    self.pages = list(
        {id(info): info for info in self.page_lookup.values()}.values()
    )

    log.info(f"📄 Total pages defined in MkDocs: {len(self.pages)}")

    # Rendered content is stored under "body" (see on_page_markdown).
    published_titles = {
        self._normalize_title(p["title"]) for p in self.pages if p.get("body")
    }
    all_nav_titles = {
        self._normalize_title(n) for n in self._collect_all_page_names(self.tab_nav)
    }

    missing = published_titles - all_nav_titles
    if missing:
        log.warning(
            f"🚨 These pages have content but were not matched in nav: {missing}"
        )

    # Publish content pages via structured tree. parent_page_id is only set by
    # on_config when a parent_page_name is configured.
    self.build_and_publish_tree(
        self.tab_nav, getattr(self, "parent_page_id", None)
    )

    # Process all deferred attachments after all pages are created
    if not self.deferred_attachments:
        log.debug("No deferred attachments to process")
        return

    total = len(self.deferred_attachments)
    log.info(
        f"🔗 Processing {total} deferred attachment collections after all plugins have finished"
    )
    for index, attachment_info in enumerate(self.deferred_attachments, 1):
        self._process_deferred_attachment(index, total, attachment_info)

    log.info("✅ Completed processing all deferred attachments")

def _process_deferred_attachment(self, index, total, attachment_info):
    """Collect and sync the attachments of one deferred page entry."""
    page_id = attachment_info["page_id"]
    page_title = attachment_info["page_title"]
    src_path = attachment_info["src_path"]
    original_content = attachment_info["original_content"]

    log.debug(
        f"Processing deferred attachments {index}/{total} for page '{page_title}' (ID: {page_id})"
    )

    # Try the content captured at publish time first.
    attachments = []
    if original_content:
        log.debug("Attempting to collect attachments from original content")
        attachments = self.collect_page_attachments(src_path, original_content)

    # Otherwise re-read the source file: another plugin may have generated
    # files in the meantime.
    if not attachments:
        log.debug(
            "No attachments found in original content, checking for generated files..."
        )
        if src_path and Path(src_path).exists():
            current_content = Path(src_path).read_text(encoding="utf-8")
            attachments = self.collect_page_attachments(src_path, current_content)

    if not attachments:
        log.debug(f"No attachments found for page '{page_title}'")
        return

    log.debug(f"Found {len(attachments)} attachments for page '{page_title}'")
    for j, attachment in enumerate(attachments, 1):
        try:
            file_size = attachment.stat().st_size
            log.debug(
                f" Attachment {j}: {attachment.name} ({file_size} bytes) - {attachment}"
            )
        except Exception as e:
            log.debug(
                f" Attachment {j}: {attachment.name} - Could not get file size: {e}"
            )

    if self.dryrun:
        log.info(
            f"DRYRUN: Would sync {len(attachments)} attachments for page '{page_title}'"
        )
    else:
        self.sync_page_attachments(page_id, attachments)
620
+
621
def get_page_url(self, title, parent_id=None):
    """Return the ``viewpage.action`` URL of a page, or ``None`` if it is unknown.

    The page id comes from the local cache when present, otherwise from a
    lookup via ``find_page_id``.
    """
    cached_id = self.page_ids.get(self._cache_key(title, parent_id))
    page_id = cached_id or self.find_page_id(title, parent_id)
    if not page_id:
        return None
    base_url = self.config["host_url"].rstrip("/")
    return f"{base_url}/pages/viewpage.action?pageId={page_id}"
629
+
630
def page_exists(self, title, parent_id=None):
    """Return ``(found, page_id)`` for the page looked up via ``find_page_id``."""
    found_id = self.find_page_id(title, parent_id)
    exists = found_id is not None
    return (exists, found_id)
633
+
634
+ def _normalize_title(self, title: str) -> str:
635
+ """
636
+ Normalize title by lowercasing, removing punctuation, and stripping whitespace.
637
+ Preserves letters and digits, removes spaces and all punctuation characters.
638
+ """
639
+ title = title.strip().lower()
640
+ return title.translate(str.maketrans("", "", string.punctuation)).replace(
641
+ " ", ""
642
+ )
643
+
644
def apply_labels_to_page(self, page_id, labels=None, page_meta=None):
    """Apply labels to a Confluence page.

    Labels are gathered, in order, from the plugin's default labels, the
    page metadata (``labels``, falling back to ``tags``) and the explicit
    ``labels`` argument; duplicates and empty values are dropped. Only labels
    not already on the page are sent. Failures are logged, not raised.
    """
    candidates = list(getattr(self, "default_labels", None) or [])

    # Add labels from page metadata
    if page_meta:
        page_labels = page_meta.get("labels", []) or page_meta.get("tags", [])
        if isinstance(page_labels, str):
            # A scalar such as `tags: foo, bar` must not be iterated
            # character by character.
            page_labels = page_labels.split(",")
        candidates.extend(
            str(label).strip() for label in (page_labels or []) if label
        )

    # Add any explicitly passed labels
    if labels:
        candidates.extend(labels)

    # Remove duplicates (and empty labels) while preserving order
    unique_labels = list(dict.fromkeys(label for label in candidates if label))

    if not unique_labels:
        log.debug(f"📝 No labels to apply to page ID {page_id}")
        return

    if self.dryrun:
        log.info(f"DRYRUN: Would apply labels {unique_labels} to page ID {page_id}")
        return

    try:
        # Get current labels to avoid duplicates
        current_labels = self.confluence.get_page_labels(page_id)
        current_label_names = {
            label["name"] for label in current_labels.get("results", [])
        }

        # Only add labels that don't already exist
        new_labels = [
            label for label in unique_labels if label not in current_label_names
        ]

        if new_labels:
            for label in new_labels:
                self.confluence.set_page_label(page_id, label)
            log.debug(f"✅ Applied labels {new_labels} to page ID {page_id}")
        else:
            log.debug(
                f"📝 All labels {unique_labels} already exist on page ID {page_id}"
            )

    except Exception as e:
        # Labelling is best-effort: never abort publishing because of it.
        log.error(
            f"❌ Failed to apply labels {unique_labels} to page ID {page_id}: {e}"
        )
708
+
709
def create_or_update_page(
    self,
    title,
    body="",
    parent_id=None,
    is_folder=False,
    attachments=None,
    abs_src_path=None,
):
    """Create or update a Confluence page. Handles folders, dry run, and logging.

    Args:
        title: Page title; the call is skipped when empty.
        body: Storage-format body. A cached footer is appended for
            non-folder pages.
        parent_id: Id of the parent page, if any.
        is_folder: Folder pages get neither footer nor labels.
        attachments: Accepted for compatibility; not used by this method.
        abs_src_path: Markdown source path; when given, attachment handling
            for the page is queued for later processing.

    Returns:
        The page id (a ``DRYRUN-<title>`` placeholder in dry-run mode when
        the page does not exist), or ``None`` when nothing was created.
    """
    if not title:
        log.warning("⚠️ create_or_update_page: Missing title. Skipping.")
        return None

    key = self.normalize_title_key(title)
    exists, existing_id = self.page_exists(title, parent_id)

    # Page info carries the footer and metadata; fall back to an exact title
    # match when the normalized key is not cached.
    page_info = self.page_lookup.get(key)
    if not page_info:
        page_info = next(
            (
                info
                for info in self.page_lookup.values()
                if info.get("title") == title
            ),
            None,
        )

    # Add footer to body if it exists
    final_body = body
    if page_info and page_info.get("footer") and not is_folder:
        final_body = body + page_info["footer"]

    # Extract metadata for labels
    page_meta = page_info.get("meta", {}) if page_info else {}

    if exists:
        page_id = existing_id
        log.info(f"📝 Page exists: '{title}' (ID={page_id}) — updating.")
        if not self.dryrun:
            self.confluence.update_page(page_id, title, final_body)
            if not is_folder:
                self.apply_labels_to_page(page_id, page_meta=page_meta)
        else:
            self.dryrun_log("update", title, parent_id)
    else:
        log.info(f"🆕 Page does not exist: '{title}' — creating.")
        if not self.dryrun:
            created = self.confluence.create_page(
                self.space, title, final_body, parent_id
            )
            # Guard against an empty response instead of crashing on .get().
            page_id = created.get("id") if created else None
            if not page_id:
                log.error(f"❌ Failed to create page '{title}': no ID returned")
                return None
            if not is_folder:
                self.apply_labels_to_page(page_id, page_meta=page_meta)
        else:
            page_id = f"DRYRUN-{title}"
            self.dryrun_log("create", title, parent_id)

    # Attachments handling - defer processing until after all plugins have run
    if abs_src_path:
        source_file = Path(abs_src_path)
        original_content = (
            source_file.read_text(encoding="utf-8") if source_file.exists() else None
        )

        self.deferred_attachments.append(
            {
                "page_id": page_id,
                "page_title": title,
                "src_path": abs_src_path,
                "original_content": original_content,
                "processed_content": body,
            }
        )
        log.debug(
            f"Deferred attachment processing for page '{title}' (ID: {page_id})"
        )

    self.page_ids[key] = page_id
    return page_id
792
+
793
def create_page(self, title, body, parent_id, is_folder=False):
    """Create a page, falling back to an update when the title already exists.

    Folder pages are written with an empty body and receive no labels.
    Returns the page id, a ``DUMMY_ID_<title>`` placeholder in dry-run mode,
    or ``None`` on failure.
    """
    cache_key = (
        self._normalize_title(title),
        str(parent_id) if parent_id else None,
    )

    if self.dryrun:
        self.dryrun_log("create page", title, parent_id)
        return f"DUMMY_ID_{title}"

    # Get page metadata for labels
    cached_info = self.page_lookup.get(self.normalize_title_key(title), {})
    page_meta = cached_info.get("meta", {})

    # Use empty string for folder body, avoid TEMPLATE_BODY for child/content pages
    storage_body = "" if is_folder else (body or "")

    try:
        log.info(
            f"📄 Attempting to create page '{title}' under parent ID {parent_id}"
        )
        result = self.confluence.create_page(
            space=self.config["space"],
            title=title,
            body=storage_body,
            parent_id=parent_id,
            representation="storage",
        )
        if result and "id" in result:
            page_id = result["id"]
            self.page_ids[cache_key] = page_id
            self.page_versions[cache_key] = 1

            if not is_folder:
                self.apply_labels_to_page(page_id, page_meta=page_meta)

            kind = "folder" if is_folder else "content"
            log.info(f"✅ Created {kind} page '{title}' with ID {page_id}")
            return page_id
    except Exception as e:
        # Only a title clash is recoverable; anything else aborts.
        if "already exists with the same TITLE" not in str(e):
            log.error(f"❌ Failed to create page '{title}': {e}", exc_info=True)
            return None
        log.warning(
            f"⚠️ Page '{title}' already exists — attempting update instead"
        )

    # Fallback: update existing page if creation fails
    page_id = self.find_page_id(title, parent_id)
    if not page_id:
        log.error(
            f"❌ Cannot update '{title}': page ID not found after creation failure"
        )
        return None

    new_version = self.page_versions.get(cache_key, 1) + 1

    try:
        log.info(
            f"🔁 Updating page '{title}' (ID {page_id}) to version {new_version}"
        )
        self.confluence.update_page(
            page_id=page_id,
            title=title,
            body=storage_body,  # Folder pages get empty body
            parent_id=parent_id,
            type="page",
            representation="storage",
            minor_edit=False,
        )
        self.page_ids[cache_key] = page_id
        self.page_versions[cache_key] = new_version

        if not is_folder:
            self.apply_labels_to_page(page_id, page_meta=page_meta)

        log.info(f"✅ Updated page '{title}' (version {new_version})")
        return page_id
    except Exception as e:
        log.error(
            f"❌ Failed to update page '{title}' (ID {page_id}): {e}", exc_info=True
        )
        return None
880
+
881
def publish_page(self, page_title, body, parent_id, source_path=None, dryrun=False):
    """Create a content page, falling back to an update when the title exists.

    Args:
        page_title: Title of the page.
        body: Storage-format body (``None`` is sent as an empty string).
        parent_id: Id of the parent page, if any.
        source_path: Accepted for compatibility; not used by this method.
        dryrun: Force dry-run for this call. The plugin-wide ``self.dryrun``
            is honoured as well, so a dry run never writes to Confluence.

    Returns:
        The page id, a ``DUMMY_ID_<title>`` placeholder in dry-run mode, or
        ``None`` on failure.
    """
    cache_key = (
        self._normalize_title(page_title),
        str(parent_id) if parent_id else None,
    )

    if dryrun or getattr(self, "dryrun", False):
        self.dryrun_log("publish page", page_title, parent_id)
        return f"DUMMY_ID_{page_title}"

    # Get page metadata for labels
    title_key = self.normalize_title_key(page_title)
    page_info = self.page_lookup.get(title_key, {})
    page_meta = page_info.get("meta", {})

    # Try to create page first
    try:
        log.info(f"📄 Creating page '{page_title}' under parent ID {parent_id}")
        result = self.confluence.create_page(
            space=self.config["space"],
            title=page_title,
            body=body or "",
            parent_id=parent_id,
            representation="storage",
        )
        if result and "id" in result:
            page_id = result["id"]
            self.page_ids[cache_key] = page_id
            self.page_versions[cache_key] = 1

            # Apply labels to newly created page
            self.apply_labels_to_page(page_id, page_meta=page_meta)

            log.info(f"✅ Created page '{page_title}' with ID {page_id}")
            return page_id
    except Exception as e:
        # Only a title clash is recoverable; anything else aborts.
        if "already exists with the same TITLE" in str(e):
            log.warning(f"⚠️ Page '{page_title}' already exists — attempting update")
        else:
            log.error(
                f"❌ Failed to create page '{page_title}': {e}", exc_info=True
            )
            return None

    # Fallback: Update existing page
    page_id = self.find_page_id(page_title, parent_id)
    if not page_id:
        log.error(f"❌ Cannot update '{page_title}': page ID not found")
        return None

    prev_version = self.page_versions.get(cache_key, 1)
    new_version = prev_version + 1
    try:
        log.info(
            f"🔁 Updating page '{page_title}' (ID {page_id}) to version {new_version}"
        )
        self.confluence.update_page(
            page_id=page_id,
            title=page_title,
            body=body or "",
            parent_id=parent_id,
            type="page",
            representation="storage",
            minor_edit=False,
        )
        self.page_ids[cache_key] = page_id
        self.page_versions[cache_key] = new_version

        # Apply labels to updated page
        self.apply_labels_to_page(page_id, page_meta=page_meta)

        log.info(f"✅ Updated page '{page_title}' (version {new_version})")
        return page_id
    except Exception as e:
        log.error(
            f"❌ Failed to update page '{page_title}' (ID {page_id}): {e}",
            exc_info=True,
        )
        return None
959
+
960
def find_or_create_page(self, title, parent_id=None, is_folder=False):
    """Return the ID of the page titled *title*, creating it when missing.

    Args:
        title: Page title to look up or create.
        parent_id: Optional parent page ID used for lookup and creation.
        is_folder: When true, a newly created page gets an empty body
            instead of ``TEMPLATE_BODY``.

    Returns:
        The existing or newly created page ID, ``DUMMY_ID_<title>`` when
        ``self.dryrun`` is set and the page does not exist, or ``None``
        when creation returned no ID.
    """
    # _cache_key normalizes the title itself, so no separate normalization
    # step is needed here.
    norm_parent_id = str(parent_id) if parent_id is not None else None
    cache_key = self._cache_key(title, norm_parent_id)

    page_id = self.find_page_id(title, parent_id=parent_id)
    if page_id:
        return page_id

    log.info(f"Creating Confluence page '{title}' under parent ID {parent_id}")
    if self.dryrun:
        self.dryrun_log("create", title, parent_id)
        return f"DUMMY_ID_{title}"

    result = self.confluence.create_page(
        space=self.config["space"],
        title=title,
        body="" if is_folder else TEMPLATE_BODY,
        parent_id=parent_id,
        representation="storage",
    )
    if result and "id" in result:
        page_id = result["id"]
        self.page_ids[cache_key] = page_id
        self.page_versions[cache_key] = 1
        return page_id

    log.error(f"Failed to create or find page '{title}'")
    return None
989
+
990
+ def find_page_id(self, title: str, parent_id: str | None = None) -> str | None:
991
+ """
992
+ Find a Confluence page ID by its title and parent page ID.
993
+ If parent_id is None, search top-level pages in the space.
994
+
995
+ Returns page ID if found, else None.
996
+ """
997
+ # Normalize title for consistent lookup if needed (depends on your implementation)
998
+ normalized_title = title.strip().lower()
999
+
1000
+ # 1) Search children of parent page if parent_id provided
1001
+ if parent_id:
1002
+ children = self.confluence.get_page_child_by_type(parent_id, "page")
1003
+ for child in children:
1004
+ if child["title"].strip().lower() == normalized_title:
1005
+ return child["id"]
1006
+
1007
+ # 2) If no parent or not found above, search globally in space by title
1008
+ # Use Confluence CQL (Confluence Query Language) to search pages by title in the space
1009
+ cql = f'title="{title}" and space="{self.config["space"]}" and type="page"'
1010
+ search_result = self.confluence.cql(cql, limit=10)
1011
+ for result in search_result.get("results", []):
1012
+ page = result.get("content")
1013
+ if page and page.get("title", "").strip().lower() == normalized_title:
1014
+ return page.get("id")
1015
+
1016
+ # Not found
1017
+ return None
1018
+
1019
def find_page_id_global(self, title):
    """Search the configured space for a page by exact title via CQL.

    Args:
        title: Page title to search for.

    Returns:
        The ID of the first search hit, or ``None`` when nothing matched.
    """
    # Escape backslashes and double quotes so the title cannot break out of
    # the quoted CQL string literal.
    escaped_title = title.replace("\\", "\\\\").replace('"', '\\"')
    cql = f'title = "{escaped_title}" and space = "{self.config["space"]}"'
    results = self.confluence.cql(cql) or {}
    hits = results.get("results")
    if not hits:
        return None

    page = hits[0]
    # The ID is read from the hit itself, falling back to its nested
    # "content" object.
    page_id = page.get("id") or page.get("content", {}).get("id")
    version = page.get("version", {}).get("number", 1)
    log.debug(
        f"Found global page '{title}' with ID {page_id} (version {version})"
    )
    return page_id
1031
+
1032
def find_page_id_or_global(self, title, parent_id=None):
    """Resolve a page ID from cache, then parent-scoped lookup, then globally.

    A hit from the parent-scoped lookup is cached under
    ``(normalized title, normalized parent ID)``; a hit from the global
    lookup is cached under ``(normalized title, None)``.

    Returns:
        The page ID, or whatever falsy value the global lookup returned.
    """
    parent_key = self._normalize_parent_id(parent_id)
    title_key = self._normalize_title(title)
    scoped_key = (title_key, parent_key)

    try:
        return self.page_ids[scoped_key]
    except KeyError:
        pass

    scoped_id = self.find_page_id(title, parent_id)
    if scoped_id:
        self.page_ids[scoped_key] = scoped_id
        return scoped_id

    log.debug(
        f"Page '{title}' not found with parent ID {parent_id}, trying global lookup"
    )
    global_id = self.find_page_id_global(title)
    if global_id:
        self.page_ids[(title_key, None)] = global_id
    return global_id
1052
+
1053
def collect_page_attachments(self, src_path, content):
    """Collect attachment files referenced in the markdown content.

    Scans *content* for ``![alt](path)`` / ``![alt](path "title")`` image
    references and resolves each local path to a file on disk.

    Resolution order:
      * ``./x``  -> relative to the source file's directory.
      * ``../x`` -> relative to the source file's directory; if missing and
        the path starts with ``../../../``, the remainder is tried under
        ``docs/``.
      * anything else -> relative to the source file's directory, then
        relative to ``docs/``.

    Args:
        src_path: Path of the markdown source file; falsy means "nothing
            to collect".
        content: Markdown text (``None`` is treated as empty).

    Returns:
        List of resolved ``Path`` objects for references that exist and
        have a supported suffix.
    """
    import re
    from pathlib import Path

    attachments = []
    if not src_path:
        log.debug("collect_page_attachments: No source path provided")
        return attachments

    src_dir = Path(src_path).parent
    log.debug(f"Collecting attachments from {src_path} (source dir: {src_dir})")

    # Find markdown image references: ![alt](path) and ![alt](path "title")
    img_pattern = r"!\[([^\]]*)\]\(([^)]+)\)"
    matches = re.findall(img_pattern, content or "")
    log.debug(
        f"Found {len(matches)} image references in markdown: {[match[1] for match in matches]}"
    )

    allowed_suffixes = (".png", ".jpg", ".jpeg", ".gif", ".svg", ".pdf", ".webp")

    for _alt_text, img_path in matches:
        # Remove any quotes and title text
        img_path = img_path.split('"')[0].strip()

        # Skip external URLs and inline data URIs; the latter are not files
        # and can be long enough to make filesystem calls raise.
        if img_path.startswith(("http://", "https://", "//", "data:")):
            continue

        if img_path.startswith("./"):
            # Remove ./ prefix
            img_path = img_path[2:]
            img_file = src_dir / img_path
        elif img_path.startswith("../"):
            # Resolve relative to the source file first.
            img_file = (src_dir / img_path).resolve()

            # A path that climbs three levels likely reaches the project
            # root, so retry the remainder under docs/.
            if not img_file.exists() and img_path.startswith("../../../"):
                img_file = Path("docs") / img_path[9:]  # Remove ../../../
        else:
            # Non-relative paths: try both relative to source file and relative to docs root
            img_file = src_dir / img_path
            if not img_file.exists():
                img_file = Path("docs") / img_path

        # Check if file exists and has a supported suffix
        if img_file.exists() and img_file.suffix.lower() in allowed_suffixes:
            resolved_path = img_file.resolve()
            file_size = resolved_path.stat().st_size
            attachments.append(resolved_path)
            log.debug(
                f"✓ Found attachment: {img_file} ({file_size} bytes) from markdown reference: {img_path}"
            )
        else:
            log.warning(
                f"✗ Referenced image not found: {img_path} (resolved to {img_file})"
            )

    return attachments
1137
+
1138
def sync_page_attachments(self, page_id, attachments):
    """Upload or refresh every attachment in *attachments* for one page.

    Does nothing (apart from logging) when authentication is not
    configured or the list is empty. A failure on one attachment is logged
    and does not stop the remaining ones.
    """
    if not self.auth_configured:
        log.warning("Authentication not configured for attachment uploads")
        return
    if not attachments:
        log.debug(f"No attachments to sync for page ID {page_id}")
        return

    log.info(f"Syncing {len(attachments)} attachments for page ID {page_id}")
    i = 0
    for attachment_path in attachments:
        i += 1
        try:
            log.debug(
                f"Processing attachment {i}/{len(attachments)}: {attachment_path.name}"
            )
            self.add_or_update_attachment(page_id, attachment_path)
        except Exception as e:
            log.error(f"Failed to sync attachment {attachment_path}: {e}")
1157
+
1158
def add_or_update_attachment(self, page_id, filepath):
    """Upload *filepath* to a page unless an identical copy is already there.

    The file's SHA-1 is embedded in the attachment comment as ``[v<hash>]``.
    An existing attachment whose comment carries the same hash is left
    alone; otherwise it is deleted and the file is uploaded again.
    """
    if not self.auth_configured:
        log.warning("Authentication not configured for attachment uploads")
        return

    # The size is only used for logging, so a failing stat() is not fatal.
    try:
        file_size = filepath.stat().st_size
        log.info(
            f"Handling attachment: file '{filepath.name}' ({file_size} bytes) for page ID {page_id}"
        )
    except Exception as e:
        log.info(
            f"Handling attachment: file '{filepath.name}' (size unknown: {e}) for page ID {page_id}"
        )

    if not page_id:
        log.error("Cannot upload attachment: Page ID is missing")
        return

    try:
        file_hash = self.get_file_sha1(filepath)
        attachment_comment = f"ConfluencePlugin [v{file_hash}]"
        log.debug(f"Attachment '{filepath.name}' hash: {file_hash}")

        existing_attachment = self.get_attachment(page_id, filepath)
        if not existing_attachment:
            log.debug(
                f"No existing attachment found for '{filepath.name}', will upload new one"
            )
        else:
            stored_comment = existing_attachment.get("metadata", {}).get("comment", "")
            current_hash_match = re.compile(r"\[v([a-f0-9]+)\]").search(stored_comment)
            if current_hash_match and current_hash_match.group(1) == file_hash:
                log.info(
                    f"Attachment '{filepath.name}' is up-to-date. Skipping upload."
                )
                return

            log.debug(
                f"Attachment '{filepath.name}' has changed (old hash: {current_hash_match.group(1) if current_hash_match else 'unknown'}, new hash: {file_hash})"
            )
            self.delete_attachment(existing_attachment["id"])
            log.info(f"Deleted outdated attachment '{filepath.name}'.")

        self.upload_attachment(page_id, filepath, attachment_comment)
    except Exception as e:
        log.error(f"Error handling attachment {filepath}: {e}")
1208
+
1209
def get_attachment(self, page_id, filepath):
    """Look up an attachment on a page by the file's name.

    Returns:
        The first entry of the response's ``results`` list, or ``None``
        when there is no match, the request fails, or an error occurs.
    """
    found = None
    try:
        # host_url may already end in /rest/api/content; strip it so the
        # suffix is not duplicated below.
        root = self.config["host_url"].replace("/rest/api/content", "")
        response = self.session.get(
            f"{root}/rest/api/content/{page_id}/child/attachment",
            params={"filename": filepath.name},
        )
        if response.status_code == 200:
            hits = response.json().get("results", [])
            if hits:
                found = hits[0]
        elif response.status_code != 404:
            log.warning(
                f"Failed to check existing attachment (status {response.status_code}): {response.text}"
            )
    except Exception as e:
        log.error(f"Error checking existing attachment: {e}")
    return found
1228
+
1229
def upload_attachment(self, page_id, filepath, comment):
    """POST a file to a page's attachment endpoint as a multipart upload.

    The outcome is only logged; nothing is returned and no exception
    escapes this method.
    """
    try:
        file_size = filepath.stat().st_size
        log.debug(
            f"Starting upload of '{filepath.name}' ({file_size} bytes) to page ID {page_id}"
        )

        # host_url may already end in /rest/api/content; strip it so the
        # suffix is not duplicated below.
        root = self.config["host_url"].replace("/rest/api/content", "")
        url = f"{root}/rest/api/content/{page_id}/child/attachment"
        log.debug(f"Upload URL: {url}")

        with open(filepath, "rb") as f:
            content_type = mimetypes.guess_type(filepath.name)[0]
            log.debug(f"Uploading file with comment: {comment}")
            response = self.session.post(
                url,
                files={"file": (filepath.name, f, content_type)},
                data={"comment": comment},
                # Disable XSRF check
                headers={"X-Atlassian-Token": "no-check"},
            )

        if response.status_code not in (200, 201):
            log.error(
                f"✗ Failed to upload attachment '{filepath.name}' (status {response.status_code}): {response.text}"
            )
        else:
            log.info(
                f"✓ Successfully uploaded attachment '{filepath.name}' ({file_size} bytes) to page ID {page_id}"
            )
            log.debug(f"Upload response status: {response.status_code}")
    except Exception as e:
        log.error(f"✗ Error uploading attachment {filepath}: {e}")
1268
+
1269
def delete_attachment(self, attachment_id):
    """Issue a DELETE for the given attachment ID and log the outcome.

    Only a 204 response is treated as success. Nothing is returned and no
    exception escapes this method.
    """
    try:
        # host_url may already end in /rest/api/content; strip it so the
        # suffix is not duplicated below.
        root = self.config["host_url"].replace("/rest/api/content", "")
        response = self.session.delete(f"{root}/rest/api/content/{attachment_id}")
        if response.status_code != 204:
            log.error(
                f"Failed to delete attachment ID {attachment_id} (status {response.status_code}): {response.text}"
            )
        else:
            log.info(f"Deleted attachment ID {attachment_id}.")
    except Exception as e:
        log.error(f"Error deleting attachment {attachment_id}: {e}")
1284
+
1285
def debug_dump_pages(self):
    """Log a one-line summary of every entry in ``self.pages``.

    Each line shows the title, parent ID (as a string), folder flag, body
    length and the first 60 characters of the body with newlines
    flattened to spaces.
    """
    if not self.pages:
        log.warning("⚠️ debug_dump_pages: self.pages is empty.")
        return

    log.info(f"📄 Debug dump of self.pages ({len(self.pages)} entries):")
    idx = 0
    for page in self.pages:
        idx += 1
        title = page.get("title", "<no title>")
        # A missing parent is rendered as the literal string "None".
        if page.get("parent_id") is not None:
            parent_id = str(page.get("parent_id"))
        else:
            parent_id = "None"
        body = page.get("body", "")
        is_folder = page.get("is_folder", False)

        body_preview = body[:60].replace("\n", " ")
        if len(body) > 60:
            body_preview += "..."

        log.info(
            f" {idx:3}: Title='{title}', ParentID='{parent_id}' ({type(parent_id).__name__}), "
            f"IsFolder={is_folder}, BodyLen={len(body)}, BodyPreview='{body_preview}'"
        )

    log.info("✅ End of debug dump.")
1309
+
1310
def build_and_publish_tree(
    self,
    nav_tree: list,
    parent_id: Optional[str] = None,
    path_stack: list = None,
    processed_pages: set = None,
):
    """Recursively publish a navigation tree.

    String nodes are treated as pages: their data is resolved from
    ``self.page_lookup`` (exact path key first, then a series of fallback
    and fuzzy strategies) and passed to ``create_or_update_page``, after
    which their attachments are synced. Dict nodes are treated as folders:
    a page is created/updated for each folder and its children are
    published recursively under the returned ID.

    Args:
        nav_tree: List whose items are strings (pages) or dicts mapping a
            folder name to a child list.
        parent_id: ID passed as the parent for pages at this level.
        path_stack: Names of the enclosing folders; ``None`` means root.
        processed_pages: Set of ``page_lookup`` keys already handled;
            shared across recursive calls and created at the top level.

    At the root level (empty ``path_stack``) every ``page_lookup`` key that
    was never processed is logged as an orphan page.
    """
    if path_stack is None:
        path_stack = []

    # Initialize processed_pages set at the top level
    if processed_pages is None:
        processed_pages = set()

    for node in nav_tree:
        if isinstance(node, str):
            path_stack_full = path_stack + [node]
            lookup_key = self.normalize_title_key("/".join(path_stack_full))

            page_info = self.page_lookup.get(lookup_key)

            # If not found, try fallback strategies
            if not page_info:
                # Strategy 1: Try just the node name
                fallback_key = self.normalize_title_key(node)
                page_info = self.page_lookup.get(fallback_key)
                if page_info:
                    log.debug(
                        f"✅ Found page using fallback key '{fallback_key}' for '{node}'"
                    )

                # Strategy 2: Try removing .md extension if present
                if not page_info and node.endswith(".md"):
                    node_without_ext = node[:-3]  # Remove .md
                    ext_fallback_key = self.normalize_title_key(node_without_ext)
                    page_info = self.page_lookup.get(ext_fallback_key)
                    if page_info:
                        log.debug(
                            f"✅ Found page using extension-stripped key '{ext_fallback_key}' for '{node}'"
                        )

                # Strategy 3: Try title-based fuzzy matching first, then fallback to key matching
                if not page_info:
                    # Convert navigation entry to clean format for comparison
                    node_clean = (
                        node.replace(".md", "").replace("-", " ").replace("_", " ")
                    )

                    # Strategy 3a: Priority title matching - direct comparison with page titles
                    best_match = None
                    best_similarity = 0.0

                    # First pass: Look for title matches with high priority
                    for key, page_data in self.page_lookup.items():
                        page_title = page_data.get("title", "")
                        if not page_title:
                            continue

                        # Calculate similarity between navigation entry and page title
                        title_similarity = self.calculate_word_similarity(
                            node_clean, page_title
                        )

                        # Bonus for context matching - check if the page path contains folder context
                        context_bonus = 0.0
                        if len(path_stack) > 0:
                            # Check if any words from the path stack appear in the page key or title
                            path_context = (
                                " ".join(path_stack)
                                .lower()
                                .replace("-", " ")
                                .replace("_", " ")
                            )
                            path_words = set(path_context.split())

                            # Check page key for context words
                            key_words = set(
                                key.lower()
                                .replace("-", " ")
                                .replace("_", " ")
                                .split()
                            )
                            title_words = set(
                                page_title.lower()
                                .replace("-", " ")
                                .replace("_", " ")
                                .split()
                            )

                            key_context_overlap = len(
                                path_words.intersection(key_words)
                            )
                            title_context_overlap = len(
                                path_words.intersection(title_words)
                            )

                            # 0.05 per shared word, capped at 0.2
                            if key_context_overlap > 0 or title_context_overlap > 0:
                                context_bonus = min(
                                    0.2,
                                    (key_context_overlap + title_context_overlap)
                                    * 0.05,
                                )

                        # Apply context bonus to title similarity
                        adjusted_similarity = title_similarity + context_bonus

                        # Higher priority for title matches
                        if (
                            adjusted_similarity > best_similarity
                            and adjusted_similarity >= 0.25
                        ):
                            best_similarity = adjusted_similarity
                            best_match = (
                                key,
                                page_data,
                                "title",
                                title_similarity,
                                context_bonus,
                            )

                    # Second pass: Only if no good title match, try key matching
                    if (
                        best_similarity < 0.4
                    ):  # Only fallback to key matching if title match is poor
                        for key, page_data in self.page_lookup.items():
                            # Calculate similarity between navigation entry and lookup key
                            key_similarity = self.calculate_word_similarity(
                                node_clean, key.replace("-", " ")
                            )

                            # Apply same context bonus logic for key matching
                            context_bonus = 0.0
                            if len(path_stack) > 0:
                                path_context = (
                                    " ".join(path_stack)
                                    .lower()
                                    .replace("-", " ")
                                    .replace("_", " ")
                                )
                                path_words = set(path_context.split())
                                key_words = set(
                                    key.lower()
                                    .replace("-", " ")
                                    .replace("_", " ")
                                    .split()
                                )
                                key_context_overlap = len(
                                    path_words.intersection(key_words)
                                )
                                if key_context_overlap > 0:
                                    context_bonus = min(
                                        0.2, key_context_overlap * 0.05
                                    )

                            adjusted_similarity = key_similarity + context_bonus

                            # A key match only wins if it beats the best
                            # title match found in the first pass.
                            if (
                                adjusted_similarity > best_similarity
                                and adjusted_similarity >= 0.25
                            ):
                                best_similarity = adjusted_similarity
                                best_match = (
                                    key,
                                    page_data,
                                    "key",
                                    key_similarity,
                                    context_bonus,
                                )

                    # best_match layout: (key, page data, match type,
                    # base similarity, context bonus)
                    if best_match:
                        page_info = best_match[1]
                        match_type = best_match[2]
                        base_similarity = best_match[3]
                        context_bonus = best_match[4]
                        log.debug(
                            f"✅ Found page using {match_type} matching '{best_match[0]}' (similarity: {base_similarity:.3f} + context: {context_bonus:.3f} = {best_similarity:.3f}) for '{node}' in path {path_stack}"
                        )

                    # Strategy 3b: Enhanced fuzzy matching as final fallback
                    if not page_info:
                        possible_keys = list(self.page_lookup.keys())
                        matches = get_close_matches(
                            lookup_key, possible_keys, n=10, cutoff=0.6
                        )

                        # If node has .md extension, also try fuzzy matching without it
                        if node.endswith(".md"):
                            node_without_ext = node[:-3]
                            ext_stripped_key = self.normalize_title_key(
                                node_without_ext
                            )
                            ext_matches = get_close_matches(
                                ext_stripped_key, possible_keys, n=10, cutoff=0.6
                            )
                            matches.extend(ext_matches)
                        # Strategy 3c: Try traditional fuzzy matching on the results
                        if not page_info:
                            for match in matches:
                                page_title = self.page_lookup[match].get(
                                    "title", ""
                                )
                                # More flexible title matching
                                normalized_page_title = (
                                    page_title.lower()
                                    .replace(" ", "-")
                                    .replace("_", "-")
                                )
                                normalized_node = (
                                    node.lower()
                                    .replace(" ", "-")
                                    .replace("_", "-")
                                    .replace(".md", "")
                                )

                                # A close-match candidate is accepted only
                                # if its title or key equals the node name
                                # after normalization.
                                if (
                                    normalized_page_title == normalized_node
                                    or match
                                    == self.normalize_title_key(
                                        node_without_ext
                                        if node.endswith(".md")
                                        else node
                                    )
                                ):
                                    page_info = self.page_lookup[match]
                                    log.debug(
                                        f"✅ Found page using fuzzy match '{match}' for '{node}'"
                                    )
                                    break

            # Every strategy failed: log what was tried and skip this node.
            if not page_info:
                log.warning(
                    f"⚠️ No page data found for '{node}' → tried key '{lookup_key}' and fallback '{fallback_key}'"
                )
                log.debug(
                    f"🔍 Best similarity was: {best_similarity:.3f} (threshold: 0.25)"
                )
                if (
                    len(self.page_lookup) <= 20
                ):  # Only show all keys if there aren't too many
                    log.debug(
                        f"🔍 Available page_lookup keys: {list(self.page_lookup.keys())}"
                    )
                else:
                    log.debug(
                        f"🔍 {len(self.page_lookup)} page_lookup keys available"
                    )
                continue

            # Mark this page as processed using the key that actually worked
            if page_info:
                # Figure out which key was actually used
                if lookup_key in self.page_lookup:
                    processed_pages.add(lookup_key)
                else:
                    fallback_key = self.normalize_title_key(node)
                    if fallback_key in self.page_lookup:
                        processed_pages.add(fallback_key)
                    else:
                        # Must have been found via fuzzy matching, find the actual key
                        # NOTE(review): uses == rather than identity, so the
                        # first entry with equal content is the one marked.
                        for key, info in self.page_lookup.items():
                            if info == page_info:
                                processed_pages.add(key)
                                break

            body = page_info.get("body", "")
            abs_src_path = page_info.get("abs_src_path")
            attachments = (
                self.attachments.get(abs_src_path, []) if abs_src_path else []
            )

            page_id = self.create_or_update_page(
                title=page_info.get("title", node),
                body=body,
                parent_id=parent_id,
                attachments=attachments,
                abs_src_path=abs_src_path,
            )
            # NOTE(review): page_id is passed on without checking whether
            # create_or_update_page succeeded.
            self.sync_page_attachments(page_id, attachments)

        elif isinstance(node, dict):
            for folder, children in node.items():
                folder_title = folder
                path_stack_full = path_stack + [folder_title]
                folder_lookup_key = self.normalize_title_key(
                    "/".join(path_stack_full)
                )

                # Folders without an entry in page_lookup get a synthetic
                # empty page description.
                folder_page_info = self.page_lookup.get(
                    folder_lookup_key,
                    {
                        "title": folder_title,
                        "body": "",
                        "is_folder": True,
                    },
                )

                # Mark folder as processed if it exists in page_lookup
                if folder_lookup_key in self.page_lookup:
                    processed_pages.add(folder_lookup_key)
                else:
                    # Try fallback for folders too
                    fallback_folder_key = self.normalize_title_key(folder_title)
                    if fallback_folder_key in self.page_lookup:
                        folder_page_info = self.page_lookup[fallback_folder_key]
                        processed_pages.add(fallback_folder_key)
                        log.debug(
                            f"✅ Found folder using fallback key '{fallback_folder_key}' for '{folder_title}'"
                        )

                folder_id = self.create_or_update_page(
                    title=folder_page_info.get("title", folder_title),
                    body=folder_page_info.get("body", ""),
                    parent_id=parent_id,
                    is_folder=folder_page_info.get("is_folder", True),
                )
                # NOTE(review): folder_id is used as the children's parent
                # without checking whether the folder page was created.
                self.build_and_publish_tree(
                    children,
                    parent_id=folder_id,
                    path_stack=path_stack_full,
                    processed_pages=processed_pages,
                )

    # Report orphan pages (only at the top level to avoid duplicate reporting)
    if not path_stack:  # Only report orphans at the root level
        orphan_pages = set(self.page_lookup.keys()) - processed_pages
        for orphan_key in orphan_pages:
            orphan_info = self.page_lookup[orphan_key]
            orphan_title = orphan_info.get("title", orphan_key)
            log.info(
                f"📄 Orphan page found: '{orphan_title}' (not referenced in navigation)"
            )
1641
+
1642
def build_page_lookup(self):
    """Rebuild ``self.page_lookup`` from ``self.pages``.

    Each page that has an ``abs_src_path`` is keyed by its path relative
    to ``docs`` (forward slashes, trailing ``.md`` removed), passed through
    ``normalize_title_key``. Pages without a source path are skipped.
    """
    self.page_lookup = {}
    for page in self.pages:
        abs_path = page.get("abs_src_path")
        if not abs_path:
            continue
        rel_path = os.path.relpath(abs_path, "docs").replace("\\", "/")
        # Strip only the file extension; a blanket replace would also
        # mangle directory or file names that merely contain ".md".
        key_path = rel_path.removesuffix(".md")
        self.page_lookup[self.normalize_title_key(key_path)] = page
1652
+
1653
def debug_dump_page_parents(self):
    """Print the child-to-parent mapping in ``self.page_parents`` to stdout."""
    lines = ["🔍 Page parent mapping:"]
    lines.extend(
        f" {child} ← {parent}" for child, parent in self.page_parents.items()
    )
    print("\n".join(lines))
1657
+
1658
def dryrun_log(self, action: str, title: str, parent_id=None):
    """Log what a dry run would have done, in a uniform format.

    A bare "create", "update" or "publish" action (any letter case) gets
    " page" appended so the message always mentions a page.
    """
    lowered = action.lower()
    # Ensure "page" is included in the action for test compatibility
    if lowered in ("create", "update", "publish") and "page" not in lowered:
        action = f"{action} page"

    if parent_id:
        parent_info = f" under parent ID {parent_id}"
    else:
        parent_info = ""
    log.info(f"DRYRUN: Would {action} '{title}'{parent_info}")
1668
+
1669
def _cache_key(self, title: str, parent_id) -> tuple:
    """Build the ``(normalized title, parent ID string or None)`` cache key.

    Any falsy ``parent_id`` (``None``, ``0``, ``""``) maps to ``None``.
    """
    title_part = self._normalize_title(title)
    parent_part = None if not parent_id else str(parent_id)
    return (title_part, parent_part)
1671
+
1672
def get_file_sha1(self, file_path):
    """Return the hexadecimal SHA-1 digest of the file at *file_path*.

    The file is read in 4096-byte chunks so it is never loaded into
    memory as a whole.
    """
    digest = hashlib.sha1()
    with open(file_path, "rb") as stream:
        while True:
            block_bytes = stream.read(4096)
            if block_bytes == b"":
                break
            digest.update(block_bytes)
    return digest.hexdigest()