mkdocs-confluence-plugin 1.27.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mkdocs_confluence_plugin/__init__.py +0 -0
- mkdocs_confluence_plugin/plugin.py +1677 -0
- mkdocs_confluence_plugin-1.27.11.dist-info/METADATA +413 -0
- mkdocs_confluence_plugin-1.27.11.dist-info/RECORD +7 -0
- mkdocs_confluence_plugin-1.27.11.dist-info/WHEEL +5 -0
- mkdocs_confluence_plugin-1.27.11.dist-info/entry_points.txt +2 -0
- mkdocs_confluence_plugin-1.27.11.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,1677 @@
|
|
|
1
|
+
import time
|
|
2
|
+
import os
|
|
3
|
+
import hashlib
|
|
4
|
+
import sys
|
|
5
|
+
import re
|
|
6
|
+
import requests
|
|
7
|
+
import mimetypes
|
|
8
|
+
import mistune
|
|
9
|
+
import contextlib
|
|
10
|
+
import logging
|
|
11
|
+
from urllib.parse import quote
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
import string
|
|
14
|
+
import mkdocs
|
|
15
|
+
from mkdocs.config import config_options
|
|
16
|
+
from mkdocs.plugins import BasePlugin
|
|
17
|
+
from mkdocs.structure.nav import Navigation
|
|
18
|
+
from mkdocs.structure.pages import Page
|
|
19
|
+
from md2cf.confluence_renderer import ConfluenceRenderer
|
|
20
|
+
from atlassian import Confluence
|
|
21
|
+
from urllib.parse import quote_plus
|
|
22
|
+
from typing import Optional
|
|
23
|
+
from difflib import get_close_matches
|
|
24
|
+
|
|
25
|
+
# Storage-format body used when on_config has to create a missing
# intermediate parent page.
TEMPLATE_BODY = "<p> TEMPLATE </p>"
# NOTE(review): not referenced in the visible part of this module; presumably
# appended to published pages elsewhere - confirm before removing.
MKDOCS_FOOTER = "This page is auto-generated and will be overwritten at the next run."

# Console handler with the plugin's own prefix so its messages stand out in
# the MkDocs build output.
stream_handler = logging.StreamHandler()
formatter = logging.Formatter(fmt="mk2conflu [%(levelname)8s] : %(message)s")
stream_handler.setFormatter(formatter)

# Module logger; on_config raises it to DEBUG when the `debug` option is set.
log = logging.getLogger(__name__)
log.addHandler(stream_handler)
log.setLevel(logging.INFO)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
@contextlib.contextmanager
def nostdout():
    """Silence ``sys.stdout`` for the duration of the ``with`` block.

    ``sys.stdout`` is swapped for a :class:`DummyFile` and always restored on
    exit, including when the body raises, so a failure inside the block can
    no longer leave the process permanently muted.
    """
    save_stdout = sys.stdout
    sys.stdout = DummyFile()
    try:
        yield
    finally:
        sys.stdout = save_stdout


class DummyFile:
    """Write-only sink that discards everything sent to it."""

    def write(self, x):
        """Discard *x*."""
        pass

    def flush(self):
        """No-op; present so ``print(..., flush=True)`` works while muted."""
        pass
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
class ConfluencePlugin(BasePlugin):
|
|
50
|
+
# Options accepted under this plugin's entry in mkdocs.yml.
# `enabled` and `only_in_nav` are read by on_config, so they are declared
# here as well; their defaults match the fallbacks on_config already used.
config_scheme = (
    ("enabled", config_options.Type(bool, default=True)),
    ("only_in_nav", config_options.Type(bool, default=False)),
    ("host_url", config_options.Type(str, default=None)),
    ("github_base_url", config_options.Type(str, default=None)),
    ("space", config_options.Type(str, default=None)),
    ("parent_page_name", config_options.Type(str, default=None)),
    (
        "username",
        # Default is captured from the environment when the module is imported.
        config_options.Type(str, default=os.environ.get("CONFLUENCE_USERNAME")),
    ),
    (
        "password",
        config_options.Type(str, default=os.environ.get("CONFLUENCE_PASSWORD")),
    ),
    ("enabled_if_env", config_options.Type(str, default=None)),
    ("verbose", config_options.Type(bool, default=False)),
    ("debug", config_options.Type(bool, default=False)),
    ("dryrun", config_options.Type(bool, default=False)),
    ("enable_footer", config_options.Type(bool, default=False)),
    ("default_labels", config_options.Type(list, default=["cpe", "mkdocs"])),
)
|
|
70
|
+
|
|
71
|
+
def __init__(self):
    """Set up renderer, HTTP session and the per-build caches.

    Every attribute that later hooks read is given a default here so that a
    hook running before (or without) the code that normally fills it sees an
    empty value instead of raising ``AttributeError``.
    """
    super().__init__()
    self.page_lookup = {}
    self.enabled = True
    self.logger = log
    self.confluence_renderer = ConfluenceRenderer(use_xhtml=True)
    self.confluence_mistune = mistune.Markdown(renderer=self.confluence_renderer)
    self.session = requests.Session()
    self.pages = []
    self.page_ids = {}
    self.page_versions = {}
    self.dryrun = False
    self.tab_nav = []
    self.attachments = {}
    self.auth_configured = False
    # Store attachments for deferred processing after all plugins have run
    self.deferred_attachments = []
    # Filled by on_config / on_nav; defaults keep on_pre_build, on_post_build
    # and _build_page_path usable when parent_page_name is not configured.
    self.space = None
    self.parent_page_id = None
    self.page_parents = {}
|
|
87
|
+
|
|
88
|
+
def normalize_title_key(self, title: str) -> str:
    """Return a lookup slug for *title*.

    The title is lower-cased, every run of characters outside ``a-z0-9`` is
    collapsed into a single ``-``, and leading/trailing dashes are removed.
    """
    lowered = title.lower()
    slug = re.sub(r"[^a-z0-9]+", "-", lowered)
    return slug.strip("-")
|
|
90
|
+
|
|
91
|
+
def extract_meaningful_words(self, text: str) -> set:
    """Extract meaningful words from text, filtering out common prefixes and numbers.

    The text is lower-cased, a leading ``kb-``/``rb-``, ``doc-``/``docs-`` and
    four-digit prefix are removed (in that order), and the remainder is split
    on dashes, underscores, whitespace, dots and slashes.

    Known abbreviations are replaced by their expansion words. Any other
    word is kept only if it is longer than two characters, is not purely
    numeric and is not a stop word.

    Returns:
        The set of resulting words; empty when nothing qualifies.
    """
    abbreviations = {
        # Include both design and decision
        "adrs": ["architecture", "design", "records", "decision"],
        "adr": ["architecture", "design", "record", "decision"],
        "arch": ["architecture"],
        "sso": ["single", "sign", "on"],
        "auth": ["authentication", "authorization", "auth"],
        "kb": ["knowledge", "base"],
        "rb": ["runbook"],
        "ci": ["continuous", "integration"],
        "cd": ["continuous", "delivery"],
        "cicd": ["continuous", "integration", "deployment", "delivery"],
        # Only reachable through the whole-text check: "/" is a separator.
        "ci/cd": ["continuous", "integration", "deployment", "delivery"],
        "aws": ["amazon", "web", "services"],
        "api": ["application", "programming", "interface"],
        "apis": ["application", "programming", "interface", "endpoints"],
        "rest": ["representational", "state", "transfer"],
        "ui": ["user", "interface"],
        "db": ["database"],
        "config": ["configuration"],
        "admin": ["administration", "administrator"],
        "mgmt": ["management"],
        "ops": ["operations"],
        "dev": ["development"],
        "prod": ["production"],
        "env": ["environment"],
        "tech": ["technology"],
        "deploy": ["deployment"],
        "troubleshoot": ["troubleshooting"],
        "setup": ["setup", "configuration"],
        "guide": ["guide", "guidelines"],
    }
    stop_words = {
        "the",
        "and",
        "for",
        "with",
        "are",
        "not",
        "how",
        "can",
        "you",
        "but",
        "was",
    }

    # Strip well-known prefixes; only kb-/rb- are removed so that other
    # abbreviations at the start of the text survive for expansion below.
    text = re.sub(r"^(kb|rb)-", "", text.lower())
    text = re.sub(r"^docs?-", "", text)
    text = re.sub(r"^\d{4}-?", "", text)  # leading numbers like "0001-"

    meaningful_words = set()

    # The whole remaining text may itself be an abbreviation (e.g. "ci/cd").
    if text in abbreviations:
        meaningful_words.update(abbreviations[text])

    for word in re.split(r"[-_\s\./]+", text):
        word = word.strip()
        if word in abbreviations:
            # Abbreviations expand regardless of their length.
            meaningful_words.update(abbreviations[word])
        elif len(word) > 2 and not word.isdigit() and word not in stop_words:
            meaningful_words.add(word)

    return meaningful_words
|
|
175
|
+
|
|
176
|
+
def calculate_word_similarity(self, text1: str, text2: str) -> float:
    """Score how alike two texts are by their shared meaningful words.

    Both texts are reduced to word sets with ``extract_meaningful_words``;
    the score is the size of the intersection divided by the size of the
    union. If either set is empty the score is ``0.0``.
    """
    first = self.extract_meaningful_words(text1)
    second = self.extract_meaningful_words(text2)

    if not (first and second):
        return 0.0

    shared = first & second
    combined = first | second
    if not combined:
        return 0.0
    return len(shared) / len(combined)
|
|
188
|
+
|
|
189
|
+
def on_config(self, config):
    """Read plugin options, connect to Confluence and resolve the root parent.

    Steps, in order:

    1. Copy simple options onto the instance; stop if ``enabled`` is false.
    2. Apply the ``enabled_if_env`` gate: the plugin stays on only when the
       named environment variable equals ``"1"``.
    3. Fill ``username``/``password`` from ``CONFLUENCE_USERNAME`` /
       ``CONFLUENCE_PASSWORD`` when not configured, then require
       ``host_url``, ``username``, ``password`` and ``space``.
    4. Build the Confluence client and configure the requests session.
    5. Walk ``parent_page_name`` (``/``-separated), creating any missing
       intermediate page, and store the last page's ID in
       ``self.parent_page_id``.

    The gate in step 2 runs before credentials are required, so a build
    that is switched off by environment variable does not need them.

    Returns:
        The MkDocs ``config`` object, unchanged.

    Raises:
        ValueError: if a required option is missing, or an intermediate
            parent page could not be created.
    """
    plugin_cfg = self.config
    self.space = plugin_cfg.get("space")
    self.enabled = plugin_cfg.get("enabled", True)
    self.only_in_nav = plugin_cfg.get("only_in_nav", False)
    # Always defined, so later hooks can read it even when no
    # parent_page_name is configured.
    self.parent_page_id = None

    if not self.enabled:
        return config

    self.default_labels = plugin_cfg.get("default_labels", ["cpe", "mkdocs"])
    self.dryrun = plugin_cfg.get("dryrun", False)

    if plugin_cfg.get("debug", False):
        log.setLevel(logging.DEBUG)

    enabled_if_env = plugin_cfg.get("enabled_if_env")
    if enabled_if_env:
        self.enabled = os.environ.get(enabled_if_env) == "1"
        if not self.enabled:
            log.warning(
                f"Exporting MKDOCS pages to Confluence turned OFF: set env var {enabled_if_env}=1 to enable."
            )
            return config
        log.info(
            f"Exporting MKDOCS pages to Confluence turned ON (env var {enabled_if_env}=1)."
        )
    else:
        log.info("Exporting MKDOCS pages to Confluence turned ON by default!")

    if self.dryrun:
        log.warning("DRYRUN MODE ENABLED: No changes will be made to Confluence.")

    # Fall back to the environment at run time for credentials.
    if not plugin_cfg.get("username"):
        plugin_cfg["username"] = os.environ.get("CONFLUENCE_USERNAME")
    if not plugin_cfg.get("password"):
        plugin_cfg["password"] = os.environ.get("CONFLUENCE_PASSWORD")

    required_keys = ["host_url", "username", "password", "space"]
    missing_keys = [k for k in required_keys if not plugin_cfg.get(k)]
    if missing_keys:
        raise ValueError(f"Missing required config keys: {', '.join(missing_keys)}")

    self.confluence = Confluence(
        # The client wants the site base URL, not the REST content endpoint.
        url=plugin_cfg["host_url"].replace("/rest/api/content", ""),
        username=plugin_cfg["username"],
        password=plugin_cfg["password"],
    )

    # Configure session for attachment uploads
    self.session.auth = (plugin_cfg["username"], plugin_cfg["password"])
    self.auth_configured = True

    if plugin_cfg.get("parent_page_name"):
        parent_parts = plugin_cfg["parent_page_name"].split("/")
        current_parent_id = None

        for part in parent_parts:
            page_id = self.find_page_id(part, parent_id=current_parent_id)
            if not page_id:
                if self.dryrun:
                    log.warning(
                        f"DRYRUN: Would create missing intermediate page: {part}"
                    )
                    # Placeholder so the walk can continue without writing.
                    page_id = f"DUMMY_ID_{part}"
                else:
                    log.warning(
                        f"Intermediate parent page '{part}' not found. Creating it..."
                    )
                    result = self.confluence.create_page(
                        space=plugin_cfg["space"],
                        title=part,
                        body=TEMPLATE_BODY,
                        parent_id=current_parent_id,
                        representation="storage",
                    )
                    if result and "id" in result:
                        page_id = result["id"]
                        self.page_ids[(part, current_parent_id)] = page_id
                        self.page_versions[(part, current_parent_id)] = 1
                        log.info(
                            f"Created intermediate parent page '{part}' with ID {page_id}"
                        )
                    else:
                        raise ValueError(
                            f"Failed to create intermediate parent page: {part}"
                        )

            current_parent_id = page_id

        self.parent_page_id = current_parent_id
        log.info(
            f"Using final root parent page ID {self.parent_page_id} for path '{plugin_cfg['parent_page_name']}'"
        )

    return config
|
|
285
|
+
|
|
286
|
+
def on_pre_build(self, config, **kwargs):
    """Create the Confluence folder pages for ``self.tab_nav`` up front.

    Does nothing when the plugin is disabled. The root parent ID is read
    defensively because it is only assigned while resolving
    ``parent_page_name``; without that option the hierarchy is created with
    no parent.

    NOTE(review): ``tab_nav`` is filled by ``on_nav``; if MkDocs fires this
    hook before ``on_nav`` the list is still empty here - confirm.
    """
    if not self.enabled:
        return
    log.info("🛠️ Pre-building Confluence folder hierarchy before content processing")
    self.create_folder_structure_only(
        self.tab_nav, parent_id=getattr(self, "parent_page_id", None)
    )
|
|
291
|
+
|
|
292
|
+
def _normalize_parent_id(self, parent_id):
|
|
293
|
+
return str(parent_id) if parent_id else None
|
|
294
|
+
|
|
295
|
+
def _collect_all_page_names(self, nav_list):
|
|
296
|
+
result = []
|
|
297
|
+
# Handle the case where nav_list is a dict (for recursive calls)
|
|
298
|
+
if isinstance(nav_list, dict):
|
|
299
|
+
nav_list = [nav_list]
|
|
300
|
+
|
|
301
|
+
for item in nav_list:
|
|
302
|
+
if isinstance(item, dict):
|
|
303
|
+
for key, value in item.items():
|
|
304
|
+
result.append(key)
|
|
305
|
+
result.extend(self._collect_all_page_names(value))
|
|
306
|
+
else:
|
|
307
|
+
result.append(item)
|
|
308
|
+
return result
|
|
309
|
+
|
|
310
|
+
def create_folder_structure_only(self, nav_tree, parent_id=None):
    """Ensure a Confluence page exists for every folder in *nav_tree*.

    Only dict nodes (folder title -> children) are processed; leaf strings
    are ignored. For each folder the page ID is taken from the local cache,
    then from ``find_page_id_or_global``, and otherwise the page is created
    with an empty body. The method then recurses into the folder's children
    using that page as the parent.

    If creation fails (exception or no ID in the response) the folder's
    children are skipped, so they are never attached to the wrong parent.
    In dry-run mode nothing is created and children are visited with no
    parent ID.

    Args:
        nav_tree: list of leaf names and ``{folder: children}`` dicts.
        parent_id: Confluence ID of the page the folders belong under.
    """
    for node in nav_tree or []:
        if not isinstance(node, dict):
            # Leaf node, nothing to do here
            continue

        for folder_title, children in node.items():
            norm_title = folder_title.strip()
            norm_key = (
                self._normalize_title(norm_title),
                str(parent_id) if parent_id else None,
            )

            if norm_key in self.page_ids:
                folder_page_id = self.page_ids[norm_key]
                log.debug(
                    f"Folder page '{norm_title}' already cached with ID {folder_page_id}"
                )
            else:
                folder_page_id = self.find_page_id_or_global(
                    norm_title, parent_id=parent_id
                )

                if folder_page_id:
                    self.page_ids[norm_key] = folder_page_id
                    self.page_versions[norm_key] = 1
                    log.debug(
                        f"Found existing folder page '{norm_title}' with ID {folder_page_id}"
                    )
                elif self.dryrun:
                    log.info(
                        f"DRYRUN: Would create folder page '{norm_title}' under parent ID {parent_id}"
                    )
                else:
                    log.info(
                        f"Creating folder page '{norm_title}' under parent ID {parent_id}"
                    )
                    try:
                        result = self.confluence.create_page(
                            space=self.config["space"],
                            title=norm_title,
                            body="",  # No body for folder pages
                            parent_id=parent_id,
                            representation="storage",
                        )
                    except Exception as e:
                        log.error(
                            f"❌ Failed to create folder page '{norm_title}': {e}"
                        )
                        continue

                    if not (result and "id" in result):
                        log.warning(
                            f"Failed to create folder page '{norm_title}': No ID returned"
                        )
                        continue

                    folder_page_id = result["id"]
                    self.page_ids[norm_key] = folder_page_id
                    self.page_versions[norm_key] = 1
                    log.info(
                        f"✅ Created folder page '{norm_title}' with ID {folder_page_id}"
                    )

            # Recurse into children
            self.create_folder_structure_only(children, parent_id=folder_page_id)
|
|
384
|
+
|
|
385
|
+
def clear_cached_page_info(self):
    """Empty the page-ID and page-version caches in place."""
    for cache_name in ("page_ids", "page_versions"):
        getattr(self, cache_name).clear()
|
|
388
|
+
|
|
389
|
+
def on_nav(self, nav: Navigation, config, files):
    """Derive a nested nav from the documentation files' source paths.

    Each path component becomes a title (underscores replaced by spaces,
    title-cased); a trailing ``.md`` is dropped from the file name.
    Directories become ``{title: [children]}`` dicts and files become plain
    strings, with siblings sorted by title. The result is stored in
    ``self.tab_nav`` and the child -> parent title map in
    ``self.page_parents``.

    When a file and a directory map to the same title, the directory wins
    and the entry is kept as a folder.
    """

    def add_to_tree(tree, parts):
        part = parts[0].replace("_", " ").title()
        if len(parts) == 1:
            # Leaf: keeps an existing folder entry of the same title intact.
            tree.setdefault(part, None)
            return
        subtree = tree.get(part)
        if not isinstance(subtree, dict):
            # Missing, or previously recorded as a leaf (None): make it a
            # folder so the recursion always receives a dict.
            subtree = tree[part] = {}
        add_to_tree(subtree, parts[1:])

    tree = {}
    for file in files.documentation_pages():
        parts = file.src_path.split(os.sep)
        if parts[-1].endswith(".md"):
            parts[-1] = parts[-1][:-3]
        add_to_tree(tree, parts)

    def flatten_tree(t):
        result = []
        for key, value in sorted(t.items()):
            if value is None:
                result.append(key)
            else:
                result.append({key: flatten_tree(value)})
        return result

    nav_structure = flatten_tree(tree)
    self.tab_nav = nav_structure  # Nested nav structure

    # Build parent-child mapping from nav
    self.page_parents = self._flatten_nav_with_parents(self.tab_nav)

    log.info(f"Auto-generated nested nav: {nav_structure}")
|
|
421
|
+
|
|
422
|
+
def _flatten_nav_with_parents(self, nav, parent=None):
|
|
423
|
+
result = {}
|
|
424
|
+
# Handle the case where nav is a dict (for recursive calls)
|
|
425
|
+
if isinstance(nav, dict):
|
|
426
|
+
nav = [nav]
|
|
427
|
+
|
|
428
|
+
for item in nav:
|
|
429
|
+
if isinstance(item, str):
|
|
430
|
+
result[item] = parent
|
|
431
|
+
elif isinstance(item, dict):
|
|
432
|
+
for k, v in item.items():
|
|
433
|
+
result[k] = parent
|
|
434
|
+
result.update(self._flatten_nav_with_parents(v, parent=k))
|
|
435
|
+
return result
|
|
436
|
+
|
|
437
|
+
def _build_page_path(self, title):
|
|
438
|
+
path = [title]
|
|
439
|
+
parent = self.page_parents.get(title)
|
|
440
|
+
while parent:
|
|
441
|
+
path.insert(0, parent)
|
|
442
|
+
parent = self.page_parents.get(parent)
|
|
443
|
+
return " / ".join(path)
|
|
444
|
+
|
|
445
|
+
def on_page_markdown(self, markdown, page, config, files):
    """Capture page content before it's rendered and store by normalized title.

    The Markdown is rendered with the Confluence renderer and one record
    (title, body, source path, meta, URL) is stored in ``self.page_lookup``
    under up to three keys: the normalized page title, the normalized path
    relative to the docs directory (without ``.md``), and the normalized
    bare file name. The same record is stored in ``self.title_to_page``.

    The relative path is computed against the configured ``docs_dir``
    (falling back to ``"docs"``), so projects with a non-default docs
    directory get the same keys as default ones.

    Returns:
        *markdown* unchanged, so MkDocs proceeds as usual.
    """
    abs_src_path = page.file.abs_src_path
    title_key = self.normalize_title_key(page.title)
    rendered = self.confluence_mistune(markdown)

    page_info = {
        "title": page.title,
        "body": rendered,
        "abs_src_path": abs_src_path,
        "meta": page.meta,
        "url": page.canonical_url,
    }

    # Store under page title key
    self.page_lookup[title_key] = page_info

    # Reverse lookup from normalized title to page info for fuzzy matching
    if not hasattr(self, "title_to_page"):
        self.title_to_page = {}
    self.title_to_page[title_key] = page_info

    path_key = filename_key = None
    if abs_src_path:
        docs_dir = (config.get("docs_dir") if config else None) or "docs"
        rel_path = os.path.relpath(abs_src_path, docs_dir).replace("\\", "/")
        # Remove .md extension for the path
        if rel_path.endswith(".md"):
            rel_path = rel_path[:-3]

        # Also store under file path key for navigation matching
        path_key = self.normalize_title_key(rel_path)
        self.page_lookup[path_key] = page_info

        # Also store under just the filename (without directory)
        filename_key = self.normalize_title_key(os.path.basename(rel_path))
        self.page_lookup[filename_key] = page_info

    self.logger.debug(
        f"📥 Cached page content under key '{title_key}' from '{abs_src_path}'"
    )
    if abs_src_path:
        self.logger.debug(
            f"📥 Also cached under path key '{path_key}' and filename key '{filename_key}'"
        )
    return markdown  # Let MkDocs proceed as usual
|
|
489
|
+
|
|
490
|
+
def on_page_content(self, html, page, config, files):
    """Process page content and add footer if enabled.

    When ``enable_footer`` is on and ``github_base_url`` is configured, a
    "View source on GitHub" link to the page's source file is appended to
    *html*. The same footer is stored on the page's ``page_lookup`` record
    (if one exists under the normalized title) for use when publishing.

    The link is built from the base URL without trailing slashes and the
    percent-encoded ``src_uri``, so a base URL ending in ``/`` or a path
    containing spaces still yields a valid link.

    Returns:
        *html*, with the footer appended when one was generated.
    """
    log.debug("🧪 on_page_content called")

    if not self.config.get("enable_footer"):
        log.debug("🚫 Footer disabled")
        return html

    github_base_url = self.config.get("github_base_url")
    if not github_base_url:
        log.warning("⚠️ Missing github_base_url - footer cannot be generated")
        return html

    if not hasattr(page.file, "src_uri"):
        log.warning("❌ No src_uri on page.file - footer cannot be generated")
        return html

    source_url = f"{github_base_url.rstrip('/')}/{quote(page.file.src_uri)}"
    footer = f'<p><em><a href="{source_url}">View source on GitHub</a></em></p>'
    log.debug(f"✅ Adding footer: {footer}")

    # Store the footer in page_lookup for later use in Confluence
    title_key = self.normalize_title_key(page.title)
    if title_key in self.page_lookup:
        self.page_lookup[title_key]["footer"] = footer

    return html + footer
|
|
516
|
+
|
|
517
|
+
def debug_dump_page_parents(self):
    """Print the child -> parent title mapping, one pair per line."""
    print("🔍 Page parent mapping:")
    mapping = self.page_parents
    for child in mapping:
        print(f" {child} ← {mapping[child]}")
|
|
521
|
+
|
|
522
|
+
def on_post_build(self, config, **kwargs):
    """Publish the captured pages, then upload their attachments.

    Steps:

    1. Skip everything when the plugin is disabled.
    2. Warn about pages that have rendered content but whose normalized
       title does not appear among the nav names.
    3. Publish the nav tree under the root parent page.
    4. For every deferred attachment record, collect attachments from the
       Markdown captured at publish time, falling back to re-reading the
       source file, and sync them to the page (only logged in dry-run).

    Page records store their rendered content under ``"body"``; the check
    in step 2 accepts ``"content"`` as well for records written elsewhere.
    """
    if not self.enabled:
        log.info("Confluence plugin disabled; skipping post-build.")
        return

    log.info(f"🔁 Nav structure for folder pages creation:\n{self.tab_nav}")
    self.debug_dump_pages()

    # Optional: dump the page_lookup keys for debugging
    log.debug(f"📄 Keys in page_lookup: {list(self.page_lookup.keys())}")

    # Populate self.pages based on page_lookup. A record stored under
    # several keys appears once per key.
    self.pages = list(self.page_lookup.values())

    log.info(f"📄 Total pages defined in MkDocs: {len(self.pages)}")

    published_titles = {
        self._normalize_title(p["title"])
        for p in self.pages
        if p.get("content") or p.get("body")
    }
    all_nav_titles = {
        self._normalize_title(n) for n in self._collect_all_page_names(self.tab_nav)
    }

    missing = published_titles - all_nav_titles
    if missing:
        log.warning(
            f"🚨 These pages have content but were not matched in nav: {missing}"
        )

    # Publish content pages via structured tree. The root parent ID is only
    # assigned when parent_page_name is configured, hence the getattr.
    self.build_and_publish_tree(
        self.tab_nav, getattr(self, "parent_page_id", None)
    )

    # Process all deferred attachments after all pages are created
    if not self.deferred_attachments:
        log.debug("No deferred attachments to process")
        return

    total = len(self.deferred_attachments)
    log.info(
        f"🔗 Processing {total} deferred attachment collections after all plugins have finished"
    )

    for i, attachment_info in enumerate(self.deferred_attachments, 1):
        page_id = attachment_info["page_id"]
        page_title = attachment_info["page_title"]
        src_path = attachment_info["src_path"]
        original_content = attachment_info["original_content"]

        log.debug(
            f"Processing deferred attachments {i}/{total} for page '{page_title}' (ID: {page_id})"
        )

        # Try the content captured at publish time first (before PlantUML processing)
        attachments = []
        if original_content:
            log.debug("Attempting to collect attachments from original content")
            attachments = self.collect_page_attachments(src_path, original_content)

        # Nothing found: re-read the file, since another plugin (PlantUML)
        # might have generated attachments in the meantime.
        if not attachments:
            log.debug(
                "No attachments found in original content, checking for generated files..."
            )
            if src_path and Path(src_path).exists():
                current_content = Path(src_path).read_text(encoding="utf-8")
                attachments = self.collect_page_attachments(
                    src_path, current_content
                )

        if not attachments:
            log.debug(f"No attachments found for page '{page_title}'")
            continue

        log.debug(f"Found {len(attachments)} attachments for page '{page_title}'")
        for j, attachment in enumerate(attachments, 1):
            try:
                file_size = attachment.stat().st_size
                log.debug(
                    f" Attachment {j}: {attachment.name} ({file_size} bytes) - {attachment}"
                )
            except Exception as e:
                # Size is informational only; never block the upload on it.
                log.debug(
                    f" Attachment {j}: {attachment.name} - Could not get file size: {e}"
                )

        if not self.dryrun:
            self.sync_page_attachments(page_id, attachments)
        else:
            log.info(
                f"DRYRUN: Would sync {len(attachments)} attachments for page '{page_title}'"
            )

    log.info("✅ Completed processing all deferred attachments")
|
|
620
|
+
|
|
621
|
+
def get_page_url(self, title, parent_id=None):
    """Return the ``viewpage.action`` URL for a page, or ``None`` if unknown.

    The page ID comes from the local cache when present, otherwise from
    ``find_page_id``. The URL is built on the site base URL: as in
    ``on_config``, a ``/rest/api/content`` segment in ``host_url`` is
    removed first so the link points at the site rather than the REST
    endpoint.
    """
    cache_key = self._cache_key(title, parent_id)
    page_id = self.page_ids.get(cache_key)
    if not page_id:
        page_id = self.find_page_id(title, parent_id)
    if not page_id:
        return None

    base_url = self.config["host_url"].replace("/rest/api/content", "").rstrip("/")
    return f"{base_url}/pages/viewpage.action?pageId={page_id}"
|
|
629
|
+
|
|
630
|
+
def page_exists(self, title, parent_id=None):
    """Look a page up and report ``(found, page_id)``.

    ``found`` is true whenever ``find_page_id`` returns anything other than
    ``None``; the returned ID is passed through unchanged.
    """
    found_id = self.find_page_id(title, parent_id)
    exists = found_id is not None
    return (exists, found_id)
|
|
633
|
+
|
|
634
|
+
def _normalize_title(self, title: str) -> str:
|
|
635
|
+
"""
|
|
636
|
+
Normalize title by lowercasing, removing punctuation, and stripping whitespace.
|
|
637
|
+
Preserves letters and digits, removes spaces and all punctuation characters.
|
|
638
|
+
"""
|
|
639
|
+
title = title.strip().lower()
|
|
640
|
+
return title.translate(str.maketrans("", "", string.punctuation)).replace(
|
|
641
|
+
" ", ""
|
|
642
|
+
)
|
|
643
|
+
|
|
644
|
+
def apply_labels_to_page(self, page_id, labels=None, page_meta=None):
    """Apply labels to a Confluence page.

    Labels are gathered, in order, from ``self.default_labels``, from the
    page metadata (``labels``, or ``tags`` when ``labels`` is empty) and
    from the explicit *labels* argument. Duplicates are dropped keeping the
    first occurrence, and only labels not already on the page are sent.

    Metadata labels are converted to stripped strings; entries that are
    empty after stripping are ignored. A single string given for *labels*
    or for the metadata value is treated as one label rather than being
    split into characters.

    Errors from Confluence are logged, not raised. In dry-run mode the
    labels are only logged.
    """
    all_labels = list(getattr(self, "default_labels", None) or [])

    # Add labels from page metadata
    if page_meta:
        page_labels = page_meta.get("labels") or page_meta.get("tags") or []
        if isinstance(page_labels, str):
            page_labels = [page_labels]
        for label in page_labels:
            if not label:
                continue
            cleaned = str(label).strip()
            if cleaned:
                all_labels.append(cleaned)

    # Add any explicitly passed labels
    if labels:
        all_labels.extend([labels] if isinstance(labels, str) else labels)

    # Remove duplicates while preserving order
    unique_labels = list(dict.fromkeys(all_labels))

    if not unique_labels:
        log.debug(f"📝 No labels to apply to page ID {page_id}")
        return

    if self.dryrun:
        log.info(f"DRYRUN: Would apply labels {unique_labels} to page ID {page_id}")
        return

    try:
        # Get current labels to avoid duplicates
        current_labels = self.confluence.get_page_labels(page_id) or {}
        current_label_names = {
            label["name"] for label in current_labels.get("results", [])
        }

        # Only add labels that don't already exist
        new_labels = [
            label for label in unique_labels if label not in current_label_names
        ]

        if new_labels:
            for label in new_labels:
                self.confluence.set_page_label(page_id, label)
            log.debug(f"✅ Applied labels {new_labels} to page ID {page_id}")
        else:
            log.debug(
                f"📝 All labels {unique_labels} already exist on page ID {page_id}"
            )

    except Exception as e:
        # Labelling is best-effort: a failure must not abort publishing.
        log.error(
            f"❌ Failed to apply labels {unique_labels} to page ID {page_id}: {e}"
        )
|
|
708
|
+
|
|
709
|
+
def create_or_update_page(
    self,
    title,
    body="",
    parent_id=None,
    is_folder=False,
    attachments=None,
    abs_src_path=None,
):
    """Create or update a Confluence page. Handles folders, dry run, and logging.

    Args:
        title: page title; a falsy title is skipped with a warning.
        body: storage-format body. For non-folder pages the footer stored
            on the page's ``page_lookup`` record, if any, is appended.
        parent_id: ID of the parent page, used for lookup and creation.
        is_folder: folder pages get neither footer nor labels.
        attachments: accepted for interface compatibility; not used here.
            Attachments are discovered later from *abs_src_path*.
        abs_src_path: path of the Markdown source. When given, a record is
            queued in ``self.deferred_attachments`` for later processing.

    Returns:
        The page ID (a ``DRYRUN-<title>`` placeholder for a dry-run create),
        or ``None`` when the title is missing or Confluence returned no ID
        for a newly created page.
    """
    if not title:
        log.warning("⚠️ create_or_update_page: Missing title. Skipping.")
        return None

    title_key = self.normalize_title_key(title)
    page_exists, existing_id = self.page_exists(title, parent_id)

    # Get page info to check for footer and metadata; fall back to an exact
    # title match when the normalized key is not present.
    page_info = self.page_lookup.get(title_key)
    if not page_info:
        page_info = next(
            (
                info
                for info in self.page_lookup.values()
                if info.get("title") == title
            ),
            None,
        )

    # Add footer to body if it exists
    final_body = body
    if page_info and page_info.get("footer") and not is_folder:
        final_body = body + page_info["footer"]

    # Extract metadata for labels
    page_meta = page_info.get("meta", {}) if page_info else {}

    if page_exists:
        page_id = existing_id
        log.info(f"📝 Page exists: '{title}' (ID={page_id}) — updating.")
        if not self.dryrun:
            self.confluence.update_page(page_id, title, final_body)
            # Apply labels to updated page (including page metadata labels)
            if not is_folder:
                self.apply_labels_to_page(page_id, page_meta=page_meta)
        else:
            self.dryrun_log("update", title, parent_id)
    else:
        log.info(f"🆕 Page does not exist: '{title}' — creating.")
        if not self.dryrun:
            created = self.confluence.create_page(
                self.space, title, final_body, parent_id
            )
            page_id = created.get("id") if created else None
            if not page_id:
                # Without an ID there is nothing to label, attach to or cache.
                log.error(f"❌ Failed to create page '{title}': No ID returned")
                return None
            # Apply labels to newly created page (including page metadata labels)
            if not is_folder:
                self.apply_labels_to_page(page_id, page_meta=page_meta)
        else:
            page_id = f"DRYRUN-{title}"
            self.dryrun_log("create", title, parent_id)

    # Attachments handling - defer processing until after all plugins have run
    if abs_src_path:
        # Snapshot the Markdown as it is on disk right now.
        source_file = Path(abs_src_path)
        original_content = (
            source_file.read_text(encoding="utf-8") if source_file.exists() else None
        )

        # Store attachment info for deferred processing
        self.deferred_attachments.append(
            {
                "page_id": page_id,
                "page_title": title,
                "src_path": abs_src_path,
                "original_content": original_content,
                "processed_content": body,
            }
        )
        log.debug(
            f"Deferred attachment processing for page '{title}' (ID: {page_id})"
        )

    self.page_ids[title_key] = page_id
    return page_id
|
|
792
|
+
|
|
793
|
+
def create_page(self, title, body, parent_id, is_folder=False):
    """Create a Confluence page, updating it instead when it already exists.

    Args:
        title: Page title.
        body: Storage-format body; ignored (sent as "") for folder pages.
        parent_id: ID of the parent page, or a falsy value for none.
        is_folder: When true the page gets an empty body and no labels.

    Returns:
        The page ID on success, a ``DUMMY_ID_<title>`` placeholder in dry-run
        mode, or ``None`` when neither creation nor the fallback update worked.
    """
    normalized_title = self._normalize_title(title)
    cache_key = (normalized_title, str(parent_id) if parent_id else None)

    if self.dryrun:
        self.dryrun_log("create page", title, parent_id)
        return f"DUMMY_ID_{title}"

    # Front-matter metadata of the page drives the labels applied below.
    lookup_entry = self.page_lookup.get(self.normalize_title_key(title), {})
    labels_meta = lookup_entry.get("meta", {})

    # Folder pages are always published with an empty body.
    storage_body = "" if is_folder else (body or "")
    page_kind = "folder" if is_folder else "content"

    try:
        log.info(
            f"📄 Attempting to create page '{title}' under parent ID {parent_id}"
        )
        created = self.confluence.create_page(
            space=self.config["space"],
            title=title,
            body=storage_body,
            parent_id=parent_id,
            representation="storage",
        )
        if created and "id" in created:
            new_id = created["id"]
            self.page_ids[cache_key] = new_id
            self.page_versions[cache_key] = 1

            if not is_folder:
                self.apply_labels_to_page(new_id, page_meta=labels_meta)

            log.info(f"✅ Created {page_kind} page '{title}' with ID {new_id}")
            return new_id
    except Exception as exc:
        # Only a title clash is recoverable; anything else aborts.
        if "already exists with the same TITLE" not in str(exc):
            log.error(f"❌ Failed to create page '{title}': {exc}", exc_info=True)
            return None
        log.warning(f"⚠️ Page '{title}' already exists — attempting update instead")

    # Fallback: locate the existing page and overwrite it.
    existing_id = self.find_page_id(title, parent_id)
    if not existing_id:
        log.error(
            f"❌ Cannot update '{title}': page ID not found after creation failure"
        )
        return None

    next_version = self.page_versions.get(cache_key, 1) + 1

    try:
        log.info(
            f"🔁 Updating page '{title}' (ID {existing_id}) to version {next_version}"
        )
        self.confluence.update_page(
            page_id=existing_id,
            title=title,
            body=storage_body,
            parent_id=parent_id,
            type="page",
            representation="storage",
            minor_edit=False,
        )
        self.page_ids[cache_key] = existing_id
        self.page_versions[cache_key] = next_version

        if not is_folder:
            self.apply_labels_to_page(existing_id, page_meta=labels_meta)

        log.info(f"✅ Updated page '{title}' (version {next_version})")
        return existing_id
    except Exception as exc:
        log.error(
            f"❌ Failed to update page '{title}' (ID {existing_id}): {exc}",
            exc_info=True,
        )
        return None
|
|
880
|
+
|
|
881
|
+
def publish_page(self, page_title, body, parent_id, source_path=None, dryrun=False):
    """Publish a content page: create it, or update it if the title exists.

    Args:
        page_title: Title of the page.
        body: Storage-format body (``None`` is sent as an empty string).
        parent_id: ID of the parent page, or a falsy value for none.
        source_path: Accepted for interface compatibility; not used here.
        dryrun: Force dry-run for this call. The plugin-wide ``self.dryrun``
            flag is honoured as well, so a dry-run build never publishes
            through this method even when the caller omits the argument.

    Returns:
        The page ID on success, a ``DUMMY_ID_<title>`` placeholder in dry-run
        mode, or ``None`` on failure.
    """
    norm_title = self._normalize_title(page_title)
    norm_parent_id = str(parent_id) if parent_id else None
    cache_key = (norm_title, norm_parent_id)

    # Sibling methods consult self.dryrun; do the same here.
    if dryrun or getattr(self, "dryrun", False):
        self.dryrun_log("publish page", page_title, parent_id)
        return f"DUMMY_ID_{page_title}"

    # Page metadata drives the labels applied after publishing.
    title_key = self.normalize_title_key(page_title)
    page_meta = self.page_lookup.get(title_key, {}).get("meta", {})
    storage_body = body or ""

    # Try to create the page first.
    try:
        log.info(f"📄 Creating page '{page_title}' under parent ID {parent_id}")
        result = self.confluence.create_page(
            space=self.config["space"],
            title=page_title,
            body=storage_body,
            parent_id=parent_id,
            representation="storage",
        )
        if result and "id" in result:
            page_id = result["id"]
            self.page_ids[cache_key] = page_id
            self.page_versions[cache_key] = 1

            self.apply_labels_to_page(page_id, page_meta=page_meta)

            log.info(f"✅ Created page '{page_title}' with ID {page_id}")
            return page_id
    except Exception as e:
        # A title clash is the only recoverable creation failure.
        if "already exists with the same TITLE" in str(e):
            log.warning(f"⚠️ Page '{page_title}' already exists — attempting update")
        else:
            log.error(
                f"❌ Failed to create page '{page_title}': {e}", exc_info=True
            )
            return None

    # Fallback: update the existing page.
    page_id = self.find_page_id(page_title, parent_id)
    if not page_id:
        log.error(f"❌ Cannot update '{page_title}': page ID not found")
        return None

    new_version = self.page_versions.get(cache_key, 1) + 1
    try:
        log.info(
            f"🔁 Updating page '{page_title}' (ID {page_id}) to version {new_version}"
        )
        self.confluence.update_page(
            page_id=page_id,
            title=page_title,
            body=storage_body,
            parent_id=parent_id,
            type="page",
            representation="storage",
            minor_edit=False,
        )
        self.page_ids[cache_key] = page_id
        self.page_versions[cache_key] = new_version

        self.apply_labels_to_page(page_id, page_meta=page_meta)

        log.info(f"✅ Updated page '{page_title}' (version {new_version})")
        return page_id
    except Exception as e:
        log.error(
            f"❌ Failed to update page '{page_title}' (ID {page_id}): {e}",
            exc_info=True,
        )
        return None
|
|
959
|
+
|
|
960
|
+
def find_or_create_page(self, title, parent_id=None, is_folder=False):
    """Return the ID of the page titled ``title``, creating it when missing.

    Args:
        title: Page title to look up or create.
        parent_id: ID of the parent page, or ``None``.
        is_folder: Folder pages are created with an empty body; other pages
            start with the ``TEMPLATE_BODY`` placeholder.

    Returns:
        The page ID, a ``DUMMY_ID_<title>`` placeholder in dry-run mode, or
        ``None`` when the page could be neither found nor created.
    """
    norm_parent_id = str(parent_id) if parent_id is not None else None
    cache_key = self._cache_key(title, norm_parent_id)

    page_id = self.find_page_id(title, parent_id=parent_id)
    if page_id:
        # Remember the ID so cache-based lookups can reuse it.
        self.page_ids[cache_key] = page_id
        return page_id

    log.info(f"Creating Confluence page '{title}' under parent ID {parent_id}")
    if self.dryrun:
        self.dryrun_log("create", title, parent_id)
        return f"DUMMY_ID_{title}"

    result = self.confluence.create_page(
        space=self.config["space"],
        title=title,
        body="" if is_folder else TEMPLATE_BODY,
        parent_id=parent_id,
        representation="storage",
    )
    if result and "id" in result:
        page_id = result["id"]
        self.page_ids[cache_key] = page_id
        self.page_versions[cache_key] = 1
        return page_id

    log.error(f"Failed to create or find page '{title}'")
    return None
|
|
989
|
+
|
|
990
|
+
def find_page_id(self, title: str, parent_id: str | None = None) -> str | None:
|
|
991
|
+
"""
|
|
992
|
+
Find a Confluence page ID by its title and parent page ID.
|
|
993
|
+
If parent_id is None, search top-level pages in the space.
|
|
994
|
+
|
|
995
|
+
Returns page ID if found, else None.
|
|
996
|
+
"""
|
|
997
|
+
# Normalize title for consistent lookup if needed (depends on your implementation)
|
|
998
|
+
normalized_title = title.strip().lower()
|
|
999
|
+
|
|
1000
|
+
# 1) Search children of parent page if parent_id provided
|
|
1001
|
+
if parent_id:
|
|
1002
|
+
children = self.confluence.get_page_child_by_type(parent_id, "page")
|
|
1003
|
+
for child in children:
|
|
1004
|
+
if child["title"].strip().lower() == normalized_title:
|
|
1005
|
+
return child["id"]
|
|
1006
|
+
|
|
1007
|
+
# 2) If no parent or not found above, search globally in space by title
|
|
1008
|
+
# Use Confluence CQL (Confluence Query Language) to search pages by title in the space
|
|
1009
|
+
cql = f'title="{title}" and space="{self.config["space"]}" and type="page"'
|
|
1010
|
+
search_result = self.confluence.cql(cql, limit=10)
|
|
1011
|
+
for result in search_result.get("results", []):
|
|
1012
|
+
page = result.get("content")
|
|
1013
|
+
if page and page.get("title", "").strip().lower() == normalized_title:
|
|
1014
|
+
return page.get("id")
|
|
1015
|
+
|
|
1016
|
+
# Not found
|
|
1017
|
+
return None
|
|
1018
|
+
|
|
1019
|
+
def find_page_id_global(self, title):
    """Look up a page by exact title anywhere in the configured space.

    Args:
        title: Page title to search for.

    Returns:
        The ID of the first CQL hit, or ``None`` when there are no results.
    """
    # Escape backslashes and double quotes so the title cannot break out of
    # the quoted CQL string literal.
    escaped_title = title.replace("\\", "\\\\").replace('"', '\\"')
    cql = f'title = "{escaped_title}" and space = "{self.config["space"]}"'
    results = self.confluence.cql(cql)
    if results.get("results"):
        page = results["results"][0]
        # The ID is read from the hit itself or from its nested "content".
        page_id = page.get("id") or page.get("content", {}).get("id")
        version = page.get("version", {}).get("number", 1)
        log.debug(
            f"Found global page '{title}' with ID {page_id} (version {version})"
        )
        return page_id
    return None
|
|
1031
|
+
|
|
1032
|
+
def find_page_id_or_global(self, title, parent_id=None):
    """Resolve a page ID via the cache, the parent's scope, then the space.

    Hits found under ``parent_id`` are cached under ``(title, parent)``;
    hits from the space-wide fallback are cached under ``(title, None)``.

    Returns:
        The page ID, or a falsy value when the page cannot be found.
    """
    parent_key = self._normalize_parent_id(parent_id)
    title_key = self._normalize_title(title)
    scoped_key = (title_key, parent_key)

    if scoped_key in self.page_ids:
        return self.page_ids[scoped_key]

    found = self.find_page_id(title, parent_id)
    if not found:
        log.debug(
            f"Page '{title}' not found with parent ID {parent_id}, trying global lookup"
        )
        found = self.find_page_id_global(title)
        if found:
            self.page_ids[(title_key, None)] = found
        return found

    self.page_ids[scoped_key] = found
    return found
|
|
1052
|
+
|
|
1053
|
+
def collect_page_attachments(self, src_path, content):
    """Collect local files referenced by markdown image syntax in ``content``.

    Args:
        src_path: Path of the markdown source file; relative references are
            resolved against its directory first.
        content: Markdown text to scan. ``None`` is treated as empty text.

    Returns:
        A list of resolved ``Path`` objects, one per reference that exists
        on disk and has a supported extension. Remote URLs and ``data:``
        URIs are skipped.
    """
    attachments = []
    if not src_path:
        log.debug("collect_page_attachments: No source path provided")
        return attachments

    src_dir = Path(src_path).parent
    log.debug(f"Collecting attachments from {src_path} (source dir: {src_dir})")

    supported_suffixes = (".png", ".jpg", ".jpeg", ".gif", ".svg", ".pdf", ".webp")

    # Find markdown image references: ![alt](path) and ![alt](path "title")
    img_pattern = r"!\[([^\]]*)\]\(([^)]+)\)"
    matches = re.findall(img_pattern, content or "")
    log.debug(
        f"Found {len(matches)} image references in markdown: {[match[1] for match in matches]}"
    )

    for _alt_text, img_path in matches:
        # Remove any quotes and title text
        img_path = img_path.split('"')[0].strip()

        # Skip external URLs and inline data URIs: nothing to upload.
        if img_path.startswith(("http://", "https://", "//", "data:")):
            continue

        try:
            if img_path.startswith("./"):
                img_path = img_path[2:]
                img_file = src_dir / img_path
            elif img_path.startswith("../"):
                # Resolve relative to the source file first.
                img_file = (src_dir / img_path).resolve()
                # A "../../../" prefix likely climbs to the project root, so
                # retry the remainder under docs/.
                if not img_file.exists() and img_path.startswith("../../../"):
                    img_file = Path("docs") / img_path[9:]
            else:
                # Try next to the source file, then under the docs root.
                img_file = src_dir / img_path
                if not img_file.exists():
                    img_file = Path("docs") / img_path

            is_attachment = (
                img_file.exists() and img_file.suffix.lower() in supported_suffixes
            )
            if is_attachment:
                resolved_path = img_file.resolve()
                file_size = resolved_path.stat().st_size
        except OSError as exc:
            # One unreadable reference must not abort the whole page.
            log.warning(f"✗ Could not inspect referenced image {img_path}: {exc}")
            continue

        if is_attachment:
            attachments.append(resolved_path)
            log.debug(
                f"✓ Found attachment: {img_file} ({file_size} bytes) from markdown reference: {img_path}"
            )
        else:
            log.warning(
                f"✗ Referenced image not found: {img_path} (resolved to {img_file})"
            )

    return attachments
|
|
1137
|
+
|
|
1138
|
+
def sync_page_attachments(self, page_id, attachments):
    """Upload or refresh every attachment in ``attachments`` for a page.

    Nothing is done when authentication is not configured, when the list is
    empty, or in dry-run mode. In dry-run mode pages receive placeholder
    IDs, so contacting the server would only produce failing requests.
    A failure on one attachment is logged and the rest are still processed.

    Args:
        page_id: ID of the Confluence page that owns the attachments.
        attachments: Paths of the files to sync.
    """
    if not self.auth_configured:
        log.warning("Authentication not configured for attachment uploads")
        return

    if not attachments:
        log.debug(f"No attachments to sync for page ID {page_id}")
        return

    if getattr(self, "dryrun", False):
        log.info(
            f"DRYRUN: Would sync {len(attachments)} attachments for page ID {page_id}"
        )
        return

    log.info(f"Syncing {len(attachments)} attachments for page ID {page_id}")
    for i, attachment_path in enumerate(attachments, 1):
        try:
            log.debug(
                f"Processing attachment {i}/{len(attachments)}: {attachment_path.name}"
            )
            self.add_or_update_attachment(page_id, attachment_path)
        except Exception as e:
            log.error(f"Failed to sync attachment {attachment_path}: {e}")
|
|
1157
|
+
|
|
1158
|
+
def add_or_update_attachment(self, page_id, filepath):
    """Upload ``filepath`` to a page unless an identical copy is already there.

    The file's SHA-1 is stored in the attachment comment as ``[v<hash>]``.
    An existing attachment with the same hash is left alone; one with a
    different (or no) hash is deleted before the new file is uploaded.
    Errors are logged, not raised.
    """
    if not self.auth_configured:
        log.warning("Authentication not configured for attachment uploads")
        return

    # The size is informational only; failing to read it is not fatal.
    try:
        size_in_bytes = filepath.stat().st_size
        log.info(
            f"Handling attachment: file '{filepath.name}' ({size_in_bytes} bytes) for page ID {page_id}"
        )
    except Exception as exc:
        log.info(
            f"Handling attachment: file '{filepath.name}' (size unknown: {exc}) for page ID {page_id}"
        )

    if not page_id:
        log.error("Cannot upload attachment: Page ID is missing")
        return

    try:
        digest = self.get_file_sha1(filepath)
        comment = f"ConfluencePlugin [v{digest}]"
        log.debug(f"Attachment '{filepath.name}' hash: {digest}")

        remote = self.get_attachment(page_id, filepath)
        if not remote:
            log.debug(
                f"No existing attachment found for '{filepath.name}', will upload new one"
            )
        else:
            remote_comment = remote.get("metadata", {}).get("comment", "")
            hash_match = re.search(r"\[v([a-f0-9]+)\]", remote_comment)
            if hash_match and hash_match.group(1) == digest:
                log.info(
                    f"Attachment '{filepath.name}' is up-to-date. Skipping upload."
                )
                return

            log.debug(
                f"Attachment '{filepath.name}' has changed (old hash: {hash_match.group(1) if hash_match else 'unknown'}, new hash: {digest})"
            )
            self.delete_attachment(remote["id"])
            log.info(f"Deleted outdated attachment '{filepath.name}'.")

        self.upload_attachment(page_id, filepath, comment)
    except Exception as exc:
        log.error(f"Error handling attachment {filepath}: {exc}")
|
|
1208
|
+
|
|
1209
|
+
def get_attachment(self, page_id, filepath):
    """Return the existing attachment on a page whose filename matches.

    Args:
        page_id: ID of the page to inspect.
        filepath: Local path; only its ``name`` is used for the lookup.

    Returns:
        The first matching attachment record from the REST response, or
        ``None`` when there is none or the lookup failed. Failures are
        logged, not raised.
    """
    try:
        # Use the base URL without /rest/api/content since it is appended
        # below; drop a trailing slash so the URL has no "//" segment.
        base_url = self.config["host_url"].replace("/rest/api/content", "").rstrip("/")
        url = f"{base_url}/rest/api/content/{page_id}/child/attachment"
        params = {"filename": filepath.name}
        response = self.session.get(url, params=params)
        if response.status_code == 200:
            results = response.json().get("results", [])
            if results:
                return results[0]
        elif response.status_code != 404:
            # A 404 is not reported; every other non-200 status is.
            log.warning(
                f"Failed to check existing attachment (status {response.status_code}): {response.text}"
            )
    except Exception as e:
        log.error(f"Error checking existing attachment: {e}")
    return None
|
|
1228
|
+
|
|
1229
|
+
def upload_attachment(self, page_id, filepath, comment):
    """Upload ``filepath`` as an attachment of page ``page_id``.

    Args:
        page_id: ID of the target page.
        filepath: Path of the file to upload.
        comment: Attachment comment sent with the upload.

    The outcome is logged; errors are logged, not raised.
    """
    try:
        file_size = filepath.stat().st_size
        log.debug(
            f"Starting upload of '{filepath.name}' ({file_size} bytes) to page ID {page_id}"
        )

        # Use the base URL without /rest/api/content since it is appended
        # below; drop a trailing slash so the URL has no "//" segment.
        base_url = self.config["host_url"].replace("/rest/api/content", "").rstrip("/")
        url = f"{base_url}/rest/api/content/{page_id}/child/attachment"
        log.debug(f"Upload URL: {url}")

        # Set headers for Confluence Cloud API
        headers = {
            "X-Atlassian-Token": "no-check",  # Disable XSRF check
        }

        with open(filepath, "rb") as f:
            files = {
                "file": (filepath.name, f, mimetypes.guess_type(filepath.name)[0])
            }
            data = {"comment": comment}
            log.debug(f"Uploading file with comment: {comment}")
            response = self.session.post(
                url, files=files, data=data, headers=headers
            )

        if response.status_code in (200, 201):
            log.info(
                f"✓ Successfully uploaded attachment '{filepath.name}' ({file_size} bytes) to page ID {page_id}"
            )
            log.debug(f"Upload response status: {response.status_code}")
        else:
            log.error(
                f"✗ Failed to upload attachment '{filepath.name}' (status {response.status_code}): {response.text}"
            )
    except Exception as e:
        log.error(f"✗ Error uploading attachment {filepath}: {e}")
|
|
1268
|
+
|
|
1269
|
+
def delete_attachment(self, attachment_id):
    """Delete the attachment with ID ``attachment_id``.

    Only HTTP 204 counts as success; any other status, and any exception,
    is logged as an error without being raised.
    """
    try:
        # Use the base URL without /rest/api/content since it is appended
        # below; drop a trailing slash so the URL has no "//" segment.
        base_url = self.config["host_url"].replace("/rest/api/content", "").rstrip("/")
        url = f"{base_url}/rest/api/content/{attachment_id}"
        response = self.session.delete(url)
        if response.status_code == 204:
            log.info(f"Deleted attachment ID {attachment_id}.")
        else:
            log.error(
                f"Failed to delete attachment ID {attachment_id} (status {response.status_code}): {response.text}"
            )
    except Exception as e:
        log.error(f"Error deleting attachment {attachment_id}: {e}")
|
|
1284
|
+
|
|
1285
|
+
def debug_dump_pages(self):
    """Log a one-line summary (title, parent, folder flag, body) per page."""
    pages = self.pages
    if not pages:
        log.warning("⚠️ debug_dump_pages: self.pages is empty.")
        return

    log.info(f"📄 Debug dump of self.pages ({len(pages)} entries):")
    for idx, page in enumerate(pages, 1):
        title = page.get("title", "<no title>")
        raw_parent = page.get("parent_id")
        parent_id = "None" if raw_parent is None else str(raw_parent)
        body = page.get("body", "")
        is_folder = page.get("is_folder", False)

        # Show at most 60 characters of the body on a single line.
        body_preview = body[:60].replace("\n", " ")
        if len(body) > 60:
            body_preview += "..."

        summary = (
            f" {idx:3}: Title='{title}', ParentID='{parent_id}' ({type(parent_id).__name__}), "
            f"IsFolder={is_folder}, BodyLen={len(body)}, BodyPreview='{body_preview}'"
        )
        log.info(summary)

    log.info("✅ End of debug dump.")
|
|
1309
|
+
|
|
1310
|
+
def build_and_publish_tree(
    self,
    nav_tree: list,
    parent_id: Optional[str] = None,
    path_stack: list = None,
    processed_pages: set = None,
):
    """Publish a navigation tree to Confluence, depth first.

    String entries are pages: each is resolved against ``self.page_lookup``
    (exact path key, bare entry key, ``.md``-stripped key, word-similarity
    match on titles then keys, and finally ``difflib`` close matches), then
    published and its attachments synced. Dict entries are folders: each
    key becomes a page and its value is published recursively beneath it.

    Args:
        nav_tree: List of page entries (``str``) and folders (``dict``).
        parent_id: ID of the page the entries are published under.
        path_stack: Folder titles leading to this level (root: ``None``).
        processed_pages: Set collecting the ``page_lookup`` keys that were
            matched; shared across the recursion.

    At the root level, every ``page_lookup`` key that was never matched is
    reported as an orphan page.
    """
    if path_stack is None:
        path_stack = []

    # Initialize processed_pages set at the top level
    if processed_pages is None:
        processed_pages = set()

    def word_set(text):
        """Return the lower-cased words of ``text`` ('-' and '_' split words)."""
        return set(text.lower().replace("-", " ").replace("_", " ").split())

    # The folder context is identical for every entry at this level.
    path_words = word_set(" ".join(path_stack))

    def context_bonus(*texts):
        """Score 0.05 per folder-context word found in ``texts``, max 0.2."""
        overlap = sum(len(path_words & word_set(text)) for text in texts)
        return min(0.2, overlap * 0.05)

    def resolve(node):
        """Return ``(lookup_key, page_info)`` for ``node``, else ``(None, None)``."""
        lookup_key = self.normalize_title_key("/".join(path_stack + [node]))
        page_info = self.page_lookup.get(lookup_key)
        if page_info:
            return lookup_key, page_info

        # Strategy 1: Try just the node name
        fallback_key = self.normalize_title_key(node)
        page_info = self.page_lookup.get(fallback_key)
        if page_info:
            log.debug(
                f"✅ Found page using fallback key '{fallback_key}' for '{node}'"
            )
            return fallback_key, page_info

        # Strategy 2: Try removing .md extension if present
        has_md_ext = node.endswith(".md")
        stripped_key = None
        if has_md_ext:
            stripped_key = self.normalize_title_key(node[:-3])
            page_info = self.page_lookup.get(stripped_key)
            if page_info:
                log.debug(
                    f"✅ Found page using extension-stripped key '{stripped_key}' for '{node}'"
                )
                return stripped_key, page_info

        # Strategy 3a: word similarity against page titles, then keys.
        node_clean = node.replace(".md", "").replace("-", " ").replace("_", " ")
        best_match = None
        best_similarity = 0.0

        for key, page_data in self.page_lookup.items():
            page_title = page_data.get("title", "")
            if not page_title:
                continue
            similarity = self.calculate_word_similarity(node_clean, page_title)
            bonus = context_bonus(key, page_title)
            adjusted = similarity + bonus
            if adjusted > best_similarity and adjusted >= 0.25:
                best_similarity = adjusted
                best_match = (key, page_data, "title", similarity, bonus)

        # Only fall back to key matching if the title match is poor.
        if best_similarity < 0.4:
            for key, page_data in self.page_lookup.items():
                similarity = self.calculate_word_similarity(
                    node_clean, key.replace("-", " ")
                )
                bonus = context_bonus(key)
                adjusted = similarity + bonus
                if adjusted > best_similarity and adjusted >= 0.25:
                    best_similarity = adjusted
                    best_match = (key, page_data, "key", similarity, bonus)

        matched_key = None
        if best_match:
            matched_key, page_info, match_type, base_similarity, bonus = best_match
            log.debug(
                f"✅ Found page using {match_type} matching '{matched_key}' (similarity: {base_similarity:.3f} + context: {bonus:.3f} = {best_similarity:.3f}) for '{node}' in path {path_stack}"
            )
            if page_info:
                return matched_key, page_info

        # Strategy 3b/3c: difflib close matches on the lookup keys, accepted
        # only when the candidate's title or key really corresponds to node.
        possible_keys = list(self.page_lookup.keys())
        candidates = get_close_matches(lookup_key, possible_keys, n=10, cutoff=0.6)
        if has_md_ext:
            candidates.extend(
                get_close_matches(stripped_key, possible_keys, n=10, cutoff=0.6)
            )

        normalized_node = (
            node.lower().replace(" ", "-").replace("_", "-").replace(".md", "")
        )
        expected_key = stripped_key if has_md_ext else fallback_key
        for match in candidates:
            page_title = self.page_lookup[match].get("title", "")
            normalized_page_title = (
                page_title.lower().replace(" ", "-").replace("_", "-")
            )
            if normalized_page_title == normalized_node or match == expected_key:
                matched_key = match
                page_info = self.page_lookup[match]
                log.debug(f"✅ Found page using fuzzy match '{match}' for '{node}'")
                break

        if page_info:
            return matched_key, page_info

        log.warning(
            f"⚠️ No page data found for '{node}' → tried key '{lookup_key}' and fallback '{fallback_key}'"
        )
        log.debug(f"🔍 Best similarity was: {best_similarity:.3f} (threshold: 0.25)")
        # Only show all keys if there aren't too many
        if len(self.page_lookup) <= 20:
            log.debug(
                f"🔍 Available page_lookup keys: {list(self.page_lookup.keys())}"
            )
        else:
            log.debug(f"🔍 {len(self.page_lookup)} page_lookup keys available")
        return None, None

    for node in nav_tree:
        if isinstance(node, str):
            matched_key, page_info = resolve(node)
            if not page_info:
                continue

            # Record the key that actually matched. Searching page_lookup
            # for an equal dict could mark a different page with equal data.
            processed_pages.add(matched_key)

            body = page_info.get("body", "")
            abs_src_path = page_info.get("abs_src_path")
            attachments = (
                self.attachments.get(abs_src_path, []) if abs_src_path else []
            )

            page_id = self.create_or_update_page(
                title=page_info.get("title", node),
                body=body,
                parent_id=parent_id,
                attachments=attachments,
                abs_src_path=abs_src_path,
            )
            self.sync_page_attachments(page_id, attachments)

        elif isinstance(node, dict):
            for folder_title, children in node.items():
                path_stack_full = path_stack + [folder_title]
                folder_lookup_key = self.normalize_title_key(
                    "/".join(path_stack_full)
                )

                # Folders without a page of their own get an empty body.
                folder_page_info = self.page_lookup.get(
                    folder_lookup_key,
                    {
                        "title": folder_title,
                        "body": "",
                        "is_folder": True,
                    },
                )

                # Mark folder as processed if it exists in page_lookup
                if folder_lookup_key in self.page_lookup:
                    processed_pages.add(folder_lookup_key)
                else:
                    # Try fallback for folders too
                    fallback_folder_key = self.normalize_title_key(folder_title)
                    if fallback_folder_key in self.page_lookup:
                        folder_page_info = self.page_lookup[fallback_folder_key]
                        processed_pages.add(fallback_folder_key)
                        log.debug(
                            f"✅ Found folder using fallback key '{fallback_folder_key}' for '{folder_title}'"
                        )

                folder_id = self.create_or_update_page(
                    title=folder_page_info.get("title", folder_title),
                    body=folder_page_info.get("body", ""),
                    parent_id=parent_id,
                    is_folder=folder_page_info.get("is_folder", True),
                )
                self.build_and_publish_tree(
                    children,
                    parent_id=folder_id,
                    path_stack=path_stack_full,
                    processed_pages=processed_pages,
                )

    # Report orphan pages (only at the top level to avoid duplicate reporting)
    if not path_stack:
        orphan_pages = set(self.page_lookup.keys()) - processed_pages
        # Sorted so the report order is stable between runs.
        for orphan_key in sorted(orphan_pages):
            orphan_title = self.page_lookup[orphan_key].get("title", orphan_key)
            log.info(
                f"📄 Orphan page found: '{orphan_title}' (not referenced in navigation)"
            )
|
|
1641
|
+
|
|
1642
|
+
def build_page_lookup(self):
    """Rebuild ``self.page_lookup`` from ``self.pages``.

    Each page with an ``abs_src_path`` is indexed under the normalized form
    of its path relative to ``docs``, without the ``.md`` extension. Pages
    without a source path are skipped.
    """
    self.page_lookup = {}
    for page in self.pages:
        abs_path = page.get("abs_src_path")
        if not abs_path:
            continue
        # NOTE(review): the docs root is hard-coded as "docs" — confirm this
        # matches the configured docs_dir.
        rel_path = os.path.relpath(abs_path, "docs").replace("\\", "/")
        # Strip only the trailing extension; ".md" inside a directory or
        # file name (e.g. "a.mdx-notes/b.md") must be left intact.
        normalized_key = self.normalize_title_key(rel_path.removesuffix(".md"))
        self.page_lookup[normalized_key] = page
|
|
1652
|
+
|
|
1653
|
+
def debug_dump_page_parents(self):
    """Print every child → parent entry of ``self.page_parents`` to stdout."""
    parent_of = self.page_parents
    print("🔍 Page parent mapping:")
    for child in parent_of:
        parent = parent_of[child]
        print(f" {child} ← {parent}")
|
|
1657
|
+
|
|
1658
|
+
def dryrun_log(self, action: str, title: str, parent_id=None):
    """Log what a dry run would have done, in one consistent format.

    Bare ``create`` / ``update`` / ``publish`` actions get " page" appended;
    the parent ID is mentioned only when it is truthy.
    """
    lowered = action.lower()
    # Ensure "page" is included in the action for test compatibility
    if lowered in ("create", "update", "publish") and "page" not in lowered:
        action = f"{action} page"

    parent_info = ""
    if parent_id:
        parent_info = f" under parent ID {parent_id}"

    log.info(f"DRYRUN: Would {action} '{title}'{parent_info}")
|
|
1668
|
+
|
|
1669
|
+
def _cache_key(self, title: str, parent_id) -> tuple:
    """Build the ``(normalized title, parent ID string or None)`` cache key."""
    normalized_title = self._normalize_title(title)
    # Any falsy parent (None, "", 0) maps to None.
    normalized_parent = None
    if parent_id:
        normalized_parent = str(parent_id)
    return (normalized_title, normalized_parent)
|
|
1671
|
+
|
|
1672
|
+
def get_file_sha1(self, file_path):
    """Return the hexadecimal SHA-1 digest of the file at ``file_path``.

    The digest is used as a content fingerprint for change detection, not
    for security, so the hash is created with ``usedforsecurity=False``.
    That keeps it usable on Python builds that restrict SHA-1.

    Args:
        file_path: Path of the file to hash.

    Returns:
        The 40-character lowercase hex digest.
    """
    digest = hashlib.sha1(usedforsecurity=False)
    with open(file_path, "rb") as f:
        # Read in 64 KiB chunks so large files are never held in memory.
        while chunk := f.read(65536):
            digest.update(chunk)
    return digest.hexdigest()
|