note-connector 0.2.5 → 0.2.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/paths.js +4 -0
- package/dist/setup-dependencies.js +56 -13
- package/package.json +3 -2
- package/py/pyproject.toml +86 -0
- package/py/src/note_mcp/__init__.py +7 -0
- package/py/src/note_mcp/__main__.py +65 -0
- package/py/src/note_mcp/api/__init__.py +31 -0
- package/py/src/note_mcp/api/articles.py +1395 -0
- package/py/src/note_mcp/api/client.py +318 -0
- package/py/src/note_mcp/api/embeds.py +482 -0
- package/py/src/note_mcp/api/images.py +456 -0
- package/py/src/note_mcp/api/preview.py +142 -0
- package/py/src/note_mcp/api/public_notes.py +150 -0
- package/py/src/note_mcp/auth/__init__.py +9 -0
- package/py/src/note_mcp/auth/browser.py +574 -0
- package/py/src/note_mcp/auth/file_session.py +145 -0
- package/py/src/note_mcp/auth/session.py +240 -0
- package/py/src/note_mcp/browser/__init__.py +10 -0
- package/py/src/note_mcp/browser/config.py +21 -0
- package/py/src/note_mcp/browser/manager.py +182 -0
- package/py/src/note_mcp/browser/preview.py +68 -0
- package/py/src/note_mcp/browser/url_helpers.py +18 -0
- package/py/src/note_mcp/chatgpt/__init__.py +1 -0
- package/py/src/note_mcp/chatgpt/__main__.py +63 -0
- package/py/src/note_mcp/chatgpt/access_log.py +25 -0
- package/py/src/note_mcp/chatgpt/auth.py +52 -0
- package/py/src/note_mcp/chatgpt/images.py +92 -0
- package/py/src/note_mcp/chatgpt/login_once.py +26 -0
- package/py/src/note_mcp/chatgpt/middleware.py +31 -0
- package/py/src/note_mcp/chatgpt/tools.py +255 -0
- package/py/src/note_mcp/chatgpt/widgets.py +121 -0
- package/py/src/note_mcp/decorators.py +113 -0
- package/py/src/note_mcp/investigator/__init__.py +33 -0
- package/py/src/note_mcp/investigator/__main__.py +11 -0
- package/py/src/note_mcp/investigator/cli.py +313 -0
- package/py/src/note_mcp/investigator/core.py +653 -0
- package/py/src/note_mcp/investigator/mcp_tools.py +225 -0
- package/py/src/note_mcp/models.py +557 -0
- package/py/src/note_mcp/py.typed +0 -0
- package/py/src/note_mcp/server.py +905 -0
- package/py/src/note_mcp/utils/__init__.py +7 -0
- package/py/src/note_mcp/utils/file_parser.py +314 -0
- package/py/src/note_mcp/utils/html_to_markdown.py +477 -0
- package/py/src/note_mcp/utils/logging.py +119 -0
- package/py/src/note_mcp/utils/markdown.py +12 -0
- package/py/src/note_mcp/utils/markdown_to_html.py +826 -0
|
@@ -0,0 +1,477 @@
|
|
|
1
|
+
"""HTML to Markdown conversion utility.
|
|
2
|
+
|
|
3
|
+
Converts note.com HTML format (ProseMirror) back to Markdown.
|
|
4
|
+
This is the reverse operation of markdown_to_html.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import html
|
|
8
|
+
import re
|
|
9
|
+
from collections.abc import Callable
|
|
10
|
+
|
|
11
|
+
# Pre-compiled regex patterns for basic block-level elements.
# Fenced code: <pre ...><code>...</code></pre>. Attributes are accepted on
# <pre> (e.g. class="codeBlock"), but the <code> tag must be written bare.
_CODE_BLOCK_PATTERN = re.compile(
    r"<pre[^>]*><code>(.*?)</code></pre>",
    re.DOTALL | re.IGNORECASE,
)
# Fallback: <pre>...</pre> whose content does not start with a bare <code>
# tag (seen in some note.com formats).
_PRE_ONLY_PATTERN = re.compile(
    r"<pre[^>]*>(?!<code>)(.*?)</pre>",
    re.DOTALL | re.IGNORECASE,
)
# <h1>..<h6>; group 1 is the tag name, group 2 the inner HTML. The
# backreference forces the closing tag to match the opening one.
_HEADING_PATTERN = re.compile(
    r"<(h[1-6])[^>]*>(.*?)</\1>",
    re.IGNORECASE | re.DOTALL,
)
# Any <p ...>...</p>; group 1 is the inner HTML.
_PARAGRAPH_PATTERN = re.compile(
    r"<p[^>]*>(.*?)</p>",
    re.IGNORECASE | re.DOTALL,
)
# <hr>, <hr/> and <hr ...> with attributes.
_HR_PATTERN = re.compile(r"<hr[^>]*/?>", re.IGNORECASE)

# Patterns for complex elements
# <figure><blockquote>BODY</blockquote><figcaption>CITATION</figcaption></figure>;
# group 1 is BODY, group 2 is CITATION. The <figcaption> tag must be bare.
_BLOCKQUOTE_FIGURE_PATTERN = re.compile(
    r"<figure[^>]*>\s*<blockquote[^>]*>(.*?)</blockquote>\s*"
    r"<figcaption>(.*?)</figcaption>\s*</figure>",
    re.DOTALL | re.IGNORECASE,
)
# <br>, <br/> and <br />.
_BR_PATTERN = re.compile(r"<br\s*/?>", re.IGNORECASE)
# A link inside a figcaption; group 1 is the href, group 2 the link text
# (which may not contain nested tags). href must be the first attribute.
_FIGCAPTION_LINK_PATTERN = re.compile(
    r'<a\s+href="([^"]+)"[^>]*>([^<]+)</a>',
    re.IGNORECASE,
)
# <figure><img src=... alt=...><figcaption>...</figcaption></figure> with src
# before alt; groups are (src, alt, caption).
_IMAGE_FIGURE_PATTERN = re.compile(
    r'<figure[^>]*>\s*<img[^>]*src="([^"]+)"[^>]*alt="([^"]*)"[^>]*>\s*'
    r"<figcaption>(.*?)</figcaption>\s*</figure>",
    re.DOTALL | re.IGNORECASE,
)
# Alternative pattern for img with alt before src; groups are
# (alt, src, caption) -- note the swapped order of the first two groups.
_IMAGE_FIGURE_PATTERN_ALT = re.compile(
    r'<figure[^>]*>\s*<img[^>]*alt="([^"]*)"[^>]*src="([^"]+)"[^>]*>\s*'
    r"<figcaption>(.*?)</figcaption>\s*</figure>",
    re.DOTALL | re.IGNORECASE,
)
# Non-greedy list matchers: each stops at the FIRST closing tag, so they are
# not nesting-aware.
_UL_PATTERN = re.compile(r"<ul[^>]*>(.*?)</ul>", re.DOTALL | re.IGNORECASE)
_OL_PATTERN = re.compile(r"<ol[^>]*>(.*?)</ol>", re.DOTALL | re.IGNORECASE)

# Patterns for inline elements
# Link with href as the first attribute; group 1 is the URL, group 2 the
# inner HTML.
_LINK_PATTERN = re.compile(
    r'<a\s+href="([^"]+)"[^>]*>(.*?)</a>',
    re.DOTALL | re.IGNORECASE,
)
# The inline matchers below only recognise tags without attributes.
_STRONG_PATTERN = re.compile(r"<strong>(.*?)</strong>", re.DOTALL | re.IGNORECASE)
_EM_PATTERN = re.compile(r"<em>(.*?)</em>", re.DOTALL | re.IGNORECASE)
_INLINE_CODE_PATTERN = re.compile(r"<code>(.*?)</code>", re.DOTALL | re.IGNORECASE)
_STRIKETHROUGH_PATTERN = re.compile(r"<s>(.*?)</s>", re.DOTALL | re.IGNORECASE)

# TOC element pattern (note.com uses TableOfContents class)
# Matches from an opening tag whose class attribute contains
# "TableOfContents" up to the first following </div>, </section> or </nav>.
_TOC_ELEMENT_PATTERN = re.compile(
    r'<[^>]*class="[^"]*TableOfContents[^"]*"[^>]*>.*?</(?:div|section|nav)>',
    re.DOTALL | re.IGNORECASE,
)
# Also match self-closing or empty TOC elements (opening tag only).
_TOC_ELEMENT_SIMPLE_PATTERN = re.compile(
    r'<[^>]*class="[^"]*TableOfContents[^"]*"[^>]*/?>',
    re.IGNORECASE,
)

# Text alignment pattern (Issue #40)
# Match <p ... style="text-align: center/right/left" ...> with possible other
# style properties. Groups are (attributes, alignment, inner HTML).
_TEXT_ALIGN_P_PATTERN = re.compile(
    r'<p([^>]*style="[^"]*text-align:\s*(center|right|left)[^"]*"[^>]*)>(.*?)</p>',
    re.DOTALL | re.IGNORECASE,
)

# Cleanup patterns
# A name="..." or id="..." attribute (with its leading whitespace) whose value
# is exactly 36 characters drawn from hex digits and hyphens.
_UUID_ATTR_PATTERN = re.compile(
    r'\s(?:name|id)="[a-f0-9-]{36}"',
    re.IGNORECASE,
)
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def _strip_fence_markers(code: str) -> str:
|
|
94
|
+
"""Strip fence markers from code block content.
|
|
95
|
+
|
|
96
|
+
Handles various formats:
|
|
97
|
+
- ```python\\ncode\\n```
|
|
98
|
+
- ```\\ncode\\n```
|
|
99
|
+
- code with fence markers at boundaries
|
|
100
|
+
"""
|
|
101
|
+
# Remove opening fence marker (``` or ```language)
|
|
102
|
+
if code.startswith("```"):
|
|
103
|
+
# Find the end of the first line (after language identifier)
|
|
104
|
+
newline_pos = code.find("\n")
|
|
105
|
+
if newline_pos != -1:
|
|
106
|
+
code = code[newline_pos + 1 :]
|
|
107
|
+
else:
|
|
108
|
+
# No newline, remove just the opening ``` and optional language
|
|
109
|
+
first_word_end = 3 # Skip ```
|
|
110
|
+
while first_word_end < len(code) and code[first_word_end].isalnum():
|
|
111
|
+
first_word_end += 1
|
|
112
|
+
code = code[first_word_end:]
|
|
113
|
+
|
|
114
|
+
# Remove closing fence marker (```)
|
|
115
|
+
code = code.rstrip()
|
|
116
|
+
if code.endswith("```"):
|
|
117
|
+
code = code[:-3]
|
|
118
|
+
|
|
119
|
+
return code.strip()
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def _create_code_block_extractor(code_blocks: list[str]) -> Callable[[re.Match[str]], str]:
    """Create a code block extractor closure with local storage.

    The returned callable is meant to be used as a ``re.sub`` replacement
    function: it stores a fenced Markdown block for every match and returns a
    ``__CODE_BLOCK_<index>__`` placeholder in its place.

    The stored block is kept HTML-entity-escaped on purpose.
    ``html_to_markdown`` substitutes the placeholders back *before* it strips
    residual tags and decodes entities, so storing decoded code would let that
    pass delete literal ``<tag>`` text from the code and decode entities a
    second time. Escaping here makes the caller's final decode a lossless
    round trip.

    Args:
        code_blocks: List to store extracted code blocks (mutated in place)

    Returns:
        A function that extracts code blocks and returns placeholders
    """

    def extract_code_block(match: re.Match[str]) -> str:
        """Extract code block and replace with placeholder."""
        # Remove genuine markup first (e.g. a <code class="..."> wrapper caught
        # by the <pre>-only pattern) while user code is still entity-escaped
        # and therefore cannot be mistaken for a tag.
        raw = re.sub(r"<[^>]+>", "", match.group(1))
        # Decode so fence detection and whitespace trimming see real text.
        code = _strip_fence_markers(html.unescape(raw))
        # Re-escape &, < and > so the block survives the caller's tag
        # stripping and is decoded exactly once by its final html.unescape.
        protected = html.escape(code, quote=False)
        # Include trailing newlines for proper paragraph separation
        code_blocks.append(f"```\n{protected}\n```\n\n")
        return f"__CODE_BLOCK_{len(code_blocks) - 1}__"

    return extract_code_block
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
def _convert_text_align_paragraph(match: re.Match[str]) -> str:
|
|
147
|
+
"""Convert text-aligned paragraph to Markdown format with alignment markers.
|
|
148
|
+
|
|
149
|
+
Args:
|
|
150
|
+
match: Regex match with groups (attrs, alignment, content)
|
|
151
|
+
|
|
152
|
+
Returns:
|
|
153
|
+
Markdown with alignment markers:
|
|
154
|
+
- center: ->text<-
|
|
155
|
+
- right: ->text
|
|
156
|
+
- left: <-text
|
|
157
|
+
"""
|
|
158
|
+
alignment = match.group(2).lower()
|
|
159
|
+
content = match.group(3).strip()
|
|
160
|
+
|
|
161
|
+
alignment_formats = {
|
|
162
|
+
"center": f"->{content}<-\n\n",
|
|
163
|
+
"right": f"->{content}\n\n",
|
|
164
|
+
"left": f"<-{content}\n\n",
|
|
165
|
+
}
|
|
166
|
+
return alignment_formats.get(alignment, f"{content}\n\n")
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
def _convert_heading(match: re.Match[str]) -> str:
|
|
170
|
+
"""Convert heading to Markdown format."""
|
|
171
|
+
level = int(match.group(1)[1]) # h1 -> 1, h2 -> 2, etc.
|
|
172
|
+
text = match.group(2).strip()
|
|
173
|
+
return f"{'#' * level} {text}\n\n"
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
def _convert_paragraph(match: re.Match[str]) -> str:
|
|
177
|
+
"""Convert paragraph to Markdown format."""
|
|
178
|
+
content = match.group(1).strip()
|
|
179
|
+
return f"{content}\n\n"
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
def _convert_blockquote_figure(match: re.Match[str]) -> str:
    """Render a blockquote figure as Markdown ``>`` lines.

    Group 1 is the quote body and group 2 the figcaption. Paragraph tags in
    the body are unwrapped, ``<br>`` becomes a line break, and every
    non-blank line is prefixed with ``> ``. A non-empty figcaption is
    appended as a citation line, formatted as ``text (url)`` when it contains
    a link.
    """
    body = match.group(1)
    caption = match.group(2).strip()

    # Unwrap <p> elements, then turn explicit line breaks into newlines.
    body = re.sub(r"<p[^>]*>(.*?)</p>", r"\1", body, flags=re.DOTALL | re.IGNORECASE)
    body = _BR_PATTERN.sub("\n", body)

    quoted: list[str] = []
    for raw_line in body.strip().split("\n"):
        stripped = raw_line.strip()
        if stripped:
            quoted.append(f"> {stripped}")

    if caption:
        linked = _FIGCAPTION_LINK_PATTERN.search(caption)
        if linked is None:
            quoted.append(f"> — {caption}")
        else:
            href, label = linked.group(1), linked.group(2)
            quoted.append(f"> — {label} ({href})")

    return "\n".join(quoted) + "\n\n"
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
def _convert_image_figure(match: re.Match[str], alt_first: bool = False) -> str:
|
|
212
|
+
"""Convert image figure to Markdown format."""
|
|
213
|
+
if alt_first:
|
|
214
|
+
alt = match.group(1)
|
|
215
|
+
src = match.group(2)
|
|
216
|
+
caption = match.group(3).strip()
|
|
217
|
+
else:
|
|
218
|
+
src = match.group(1)
|
|
219
|
+
alt = match.group(2)
|
|
220
|
+
caption = match.group(3).strip()
|
|
221
|
+
|
|
222
|
+
if caption:
|
|
223
|
+
return f'\n\n'
|
|
224
|
+
return f"\n\n"
|
|
225
|
+
|
|
226
|
+
|
|
227
|
+
def _find_matching_tags(
|
|
228
|
+
html_content: str,
|
|
229
|
+
tag_name: str,
|
|
230
|
+
) -> list[tuple[str, int, int]]:
|
|
231
|
+
"""Find all top-level matching tag pairs with proper nesting support.
|
|
232
|
+
|
|
233
|
+
Args:
|
|
234
|
+
html_content: HTML string to search
|
|
235
|
+
tag_name: Tag name to find (e.g., "li", "ul")
|
|
236
|
+
|
|
237
|
+
Returns:
|
|
238
|
+
List of (content, start_pos, end_pos) tuples for each match
|
|
239
|
+
"""
|
|
240
|
+
results: list[tuple[str, int, int]] = []
|
|
241
|
+
open_tag = f"<{tag_name}"
|
|
242
|
+
close_tag = f"</{tag_name}>"
|
|
243
|
+
pos = 0
|
|
244
|
+
|
|
245
|
+
while pos < len(html_content):
|
|
246
|
+
# Find opening tag
|
|
247
|
+
tag_start = html_content.find(open_tag, pos)
|
|
248
|
+
if tag_start == -1:
|
|
249
|
+
break
|
|
250
|
+
|
|
251
|
+
# Find the > that closes the opening tag
|
|
252
|
+
tag_end = html_content.find(">", tag_start)
|
|
253
|
+
if tag_end == -1:
|
|
254
|
+
break
|
|
255
|
+
|
|
256
|
+
# Track depth to find matching close tag
|
|
257
|
+
depth = 1
|
|
258
|
+
search_pos = tag_end + 1
|
|
259
|
+
|
|
260
|
+
while depth > 0 and search_pos < len(html_content):
|
|
261
|
+
next_open = html_content.find(open_tag, search_pos)
|
|
262
|
+
next_close = html_content.find(close_tag, search_pos)
|
|
263
|
+
|
|
264
|
+
if next_close == -1:
|
|
265
|
+
break
|
|
266
|
+
|
|
267
|
+
if next_open != -1 and next_open < next_close:
|
|
268
|
+
depth += 1
|
|
269
|
+
search_pos = next_open + len(open_tag)
|
|
270
|
+
else:
|
|
271
|
+
depth -= 1
|
|
272
|
+
if depth == 0:
|
|
273
|
+
content = html_content[tag_end + 1 : next_close]
|
|
274
|
+
results.append((content, tag_start, next_close + len(close_tag)))
|
|
275
|
+
search_pos = next_close + len(close_tag)
|
|
276
|
+
|
|
277
|
+
pos = search_pos
|
|
278
|
+
|
|
279
|
+
return results
|
|
280
|
+
|
|
281
|
+
|
|
282
|
+
def _find_matching_li_tags(html_content: str) -> list[str]:
    """Return the inner HTML of every top-level ``<li>`` in *html_content*.

    Delegates to ``_find_matching_tags`` and keeps only the content part of
    each result, discarding the positions.
    """
    return [found[0] for found in _find_matching_tags(html_content, "li")]
|
|
285
|
+
|
|
286
|
+
|
|
287
|
+
def _convert_list(html_content: str, ordered: bool = False, indent_level: int = 0) -> str:
    """Convert list to Markdown format with nested list support.

    Nested ``<ul>``/``<ol>`` elements are located with depth-aware tag
    matching rather than non-greedy regexes, so lists nested more than two
    levels deep, and items holding several nested lists, are converted in
    full and in document order. An item's own text is taken only from the
    HTML outside its nested lists.

    Args:
        html_content: HTML content of list
        ordered: True for ordered list, False for unordered
        indent_level: Current indentation level (0 = top level)

    Returns:
        Markdown formatted list, terminated by a single newline
    """
    indent = "  " * indent_level  # 2 spaces per level
    lines: list[str] = []
    counter = 1

    for li_content in _find_matching_li_tags(html_content):
        # Gather every nested list of either kind, ordered by position.
        spans = [(start, end, inner, False) for inner, start, end in _find_matching_tags(li_content, "ul")]
        spans += [(start, end, inner, True) for inner, start, end in _find_matching_tags(li_content, "ol")]
        spans.sort(key=lambda span: span[0])

        nested: list[tuple[str, bool]] = []
        own_parts: list[str] = []
        cursor = 0
        for start, end, inner, is_ordered in spans:
            if start < cursor:
                # Lies inside a list already claimed; the recursive call
                # for that list will handle it.
                continue
            own_parts.append(li_content[cursor:start])
            nested.append((inner, is_ordered))
            cursor = end
        own_parts.append(li_content[cursor:])
        own_html = "".join(own_parts)

        # Prefer the first <p> of the item itself; fall back to all own HTML.
        p_match = re.search(r"<p[^>]*>(.*?)</p>", own_html, re.DOTALL | re.IGNORECASE)
        text = p_match.group(1) if p_match else own_html

        # Clean up any remaining HTML tags from text
        text = re.sub(r"<[^>]+>", "", text).strip()

        if text:  # Only add if there's text content
            if ordered:
                lines.append(f"{indent}{counter}. {text}")
                counter += 1
            else:
                lines.append(f"{indent}- {text}")

        for inner, is_ordered in nested:
            nested_md = _convert_list(inner, ordered=is_ordered, indent_level=indent_level + 1).rstrip()
            if nested_md:  # an empty nested list must not add a blank line
                lines.append(nested_md)

    return "\n".join(lines) + "\n"
|
|
338
|
+
|
|
339
|
+
|
|
340
|
+
def _find_matching_tag_content(html_content: str, tag_name: str) -> tuple[str, int, int] | None:
    """Return the first top-level *tag_name* element found in *html_content*.

    Uses ``_find_matching_tags``, so nested same-name tags are balanced.

    Returns (content, start_pos, end_pos) or None if not found.
    """
    return next(iter(_find_matching_tags(html_content, tag_name)), None)
|
|
347
|
+
|
|
348
|
+
|
|
349
|
+
def _convert_all_lists(html_content: str) -> str:
    """Replace every ``<ul>``/``<ol>`` element in the HTML with Markdown.

    On each pass the earliest top-level list (of either kind) is located and
    replaced by the output of ``_convert_list``, which renders its nested
    lists itself. The loop ends when no list is found or after a fixed
    number of passes.
    """
    max_iterations = 100  # Prevent infinite loops
    converted = html_content

    for _ in range(max_iterations):
        candidates: list[tuple[tuple[str, int, int], bool]] = []
        for tag, is_ordered in (("ul", False), ("ol", True)):
            found = _find_matching_tag_content(converted, tag)
            if found is not None:
                candidates.append((found, is_ordered))

        if not candidates:
            break

        # Whichever list starts first in the document is converted first.
        (inner, start, end), is_ordered = min(candidates, key=lambda item: item[0][1])
        markdown = _convert_list(inner, ordered=is_ordered)
        converted = converted[:start] + markdown + converted[end:]

    return converted
|
|
377
|
+
|
|
378
|
+
|
|
379
|
+
def _convert_link(match: re.Match[str]) -> str:
|
|
380
|
+
"""Convert link to Markdown format."""
|
|
381
|
+
url = match.group(1)
|
|
382
|
+
text = match.group(2).strip()
|
|
383
|
+
return f"[{text}]({url})"
|
|
384
|
+
|
|
385
|
+
|
|
386
|
+
def _convert_inline_elements(text: str) -> str:
    """Convert inline HTML elements in *text* to their Markdown forms.

    Links are converted first, then bold, italic, strikethrough and finally
    inline code, in that order.
    """
    converted = _LINK_PATTERN.sub(_convert_link, text)

    # Inline code goes last; it must run after code block extraction to avoid
    # false matches on <pre><code> blocks.
    simple_rules = (
        (_STRONG_PATTERN, r"**\1**"),
        (_EM_PATTERN, r"*\1*"),
        (_STRIKETHROUGH_PATTERN, r"~~\1~~"),
        (_INLINE_CODE_PATTERN, r"`\1`"),
    )
    for pattern, template in simple_rules:
        converted = pattern.sub(template, converted)

    return converted
|
|
406
|
+
|
|
407
|
+
|
|
408
|
+
def html_to_markdown(html_content: str) -> str:
    """Convert note.com HTML to Markdown.

    The conversion is a fixed pipeline of regex and tag-matching passes; the
    order matters because later passes rely on earlier ones having consumed
    their elements.

    Args:
        html_content: HTML string from note.com editor (ProseMirror format)

    Returns:
        Markdown formatted text (empty string for empty or blank input)
    """
    if not (html_content and html_content.strip()):
        return ""

    # Per-call storage for extracted code blocks (no shared module state).
    code_blocks: list[str] = []
    protect_code = _create_code_block_extractor(code_blocks)

    # 1. Code blocks: swap them for placeholders so later passes skip them.
    #    <pre> without <code> is handled as well (some note.com formats).
    text = _CODE_BLOCK_PATTERN.sub(protect_code, html_content)
    text = _PRE_ONLY_PATTERN.sub(protect_code, text)

    # 2. Table-of-contents elements become a [TOC] marker.
    for toc_pattern in (_TOC_ELEMENT_PATTERN, _TOC_ELEMENT_SIMPLE_PATTERN):
        text = toc_pattern.sub("[TOC]\n\n", text)

    # 3. Figure elements (blockquote and image figures are handled first).
    text = _BLOCKQUOTE_FIGURE_PATTERN.sub(_convert_blockquote_figure, text)
    text = _IMAGE_FIGURE_PATTERN.sub(lambda found: _convert_image_figure(found, alt_first=False), text)
    text = _IMAGE_FIGURE_PATTERN_ALT.sub(lambda found: _convert_image_figure(found, alt_first=True), text)

    # 4. Headings.
    text = _HEADING_PATTERN.sub(_convert_heading, text)

    # 5. Lists (nesting-aware, using explicit tag matching).
    text = _convert_all_lists(text)

    # 6. Horizontal rules.
    text = _HR_PATTERN.sub("\n---\n\n", text)

    # 7. Inline elements (links, bold, italic, strikethrough, inline code).
    text = _convert_inline_elements(text)

    # 8. Paragraphs carrying text alignment (before plain paragraphs, which
    #    would otherwise consume them).
    text = _TEXT_ALIGN_P_PATTERN.sub(_convert_text_align_paragraph, text)

    # 9. Plain paragraphs (after every other element has been processed).
    text = _PARAGRAPH_PATTERN.sub(_convert_paragraph, text)

    # === Final processing ===

    # Restore the code block placeholders. NOTE: the restored blocks pass
    # through the cleanup steps below together with the rest of the text.
    for index, block in enumerate(code_blocks):
        text = text.replace(f"__CODE_BLOCK_{index}__", block)

    # Drop UUID-valued name/id attributes (cleanup in case any remain).
    text = _UUID_ATTR_PATTERN.sub("", text)

    # Remove leftover HTML tags. This runs before entity decoding so that
    # entity-escaped text such as &lt;tag&gt; in user content is preserved.
    text = re.sub(r"<[^>]+>", "", text)

    # Decode HTML entities.
    text = html.unescape(text)

    # Collapse runs of three or more newlines into one blank line.
    text = re.sub(r"\n{3,}", "\n\n", text)

    return text.strip()
|
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
"""Secure logging configuration for note-mcp.
|
|
2
|
+
|
|
3
|
+
Provides logging setup with cookie value masking for security.
|
|
4
|
+
Cookie values are completely masked in all log output.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import logging
|
|
8
|
+
import re
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class CookieMaskingFilter(logging.Filter):
    """Logging filter that masks cookie values for security.

    The values of the ``note_gql_auth_token`` and ``_note_session_v5``
    cookies are replaced with [MASKED] in the record's message and in its
    string arguments, to prevent credential leakage in logs. Non-string
    arguments are passed through untouched.
    """

    # Patterns to match cookie values in various formats. In every pattern
    # group 2 is the secret value; all other matched text is preserved.
    COOKIE_PATTERNS = [
        # Match note_gql_auth_token=VALUE or _note_session_v5=VALUE
        re.compile(r"(note_gql_auth_token|_note_session_v5)[=:]\s*([^\s;,}\"']+)"),
        # Match cookie dict format {"name": "value"}
        re.compile(r'(["\']?(?:note_gql_auth_token|_note_session_v5)["\']?\s*[=:]\s*["\'])([^"\']+)(["\'])'),
        # Match Cookie header format
        re.compile(r"(Cookie:\s*[^;]*?(?:note_gql_auth_token|_note_session_v5)=)([^;\s]+)"),
    ]

    def filter(self, record: logging.LogRecord) -> bool:
        """Filter and mask cookie values in log records.

        Args:
            record: Log record to process (its ``msg`` and ``args`` are
                rewritten in place)

        Returns:
            Always True (record is always passed through, just modified)
        """
        from collections.abc import Mapping

        if record.msg:
            record.msg = self._mask_cookies(str(record.msg))

        args = record.args
        if isinstance(args, Mapping):
            # "%(key)s"-style logging stores a mapping; it must stay a
            # mapping or the message can no longer be formatted.
            record.args = {key: self._mask_arg(value) for key, value in args.items()}
        elif args:
            record.args = tuple(self._mask_arg(value) for value in args)
        return True

    def _mask_arg(self, value: object) -> object:
        """Mask *value* when it is a string; return anything else unchanged."""
        return self._mask_cookies(value) if isinstance(value, str) else value

    @staticmethod
    def _replace_value(match: re.Match[str]) -> str:
        """Return the matched text with only group 2 swapped for [MASKED]."""
        whole = match.group(0)
        offset = match.start()
        # Splicing on group 2's span keeps the separator and any quotes intact.
        return whole[: match.start(2) - offset] + "[MASKED]" + whole[match.end(2) - offset :]

    def _mask_cookies(self, text: str) -> str:
        """Mask all cookie values in text.

        Args:
            text: Text potentially containing cookie values

        Returns:
            Text with cookie values replaced by [MASKED]
        """
        result = text
        for pattern in self.COOKIE_PATTERNS:
            result = pattern.sub(self._replace_value, result)
        return result
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def setup_logging(level: int = logging.INFO, name: str | None = None) -> logging.Logger:
    """Configure and return a logger whose console output masks cookies.

    Any handlers already attached to the logger are removed, then a single
    stream handler carrying a ``CookieMaskingFilter`` is installed.

    Args:
        level: Logging level applied to both logger and handler
            (default: INFO)
        name: Logger name (default: "note_mcp")

    Returns:
        Configured logger instance
    """
    logger = logging.getLogger(name or "note_mcp")
    logger.setLevel(level)

    # Start from a clean slate so repeated calls do not stack handlers.
    for stale in list(logger.handlers):
        logger.removeHandler(stale)

    console = logging.StreamHandler()
    console.setLevel(level)
    console.setFormatter(logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s"))
    # The filter sits on the handler, so it sees every record the handler emits.
    console.addFilter(CookieMaskingFilter())

    logger.addHandler(console)
    return logger
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def get_logger(name: str | None = None) -> logging.Logger:
|
|
106
|
+
"""Get a logger instance with cookie masking.
|
|
107
|
+
|
|
108
|
+
Gets or creates a child logger under the note_mcp namespace.
|
|
109
|
+
All loggers created this way inherit the cookie masking filter.
|
|
110
|
+
|
|
111
|
+
Args:
|
|
112
|
+
name: Logger name suffix (e.g., "api" for "note_mcp.api")
|
|
113
|
+
|
|
114
|
+
Returns:
|
|
115
|
+
Logger instance
|
|
116
|
+
"""
|
|
117
|
+
if name:
|
|
118
|
+
return logging.getLogger(f"note_mcp.{name}")
|
|
119
|
+
return logging.getLogger("note_mcp")
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
"""Backward compatibility alias for markdown modules.
|
|
2
|
+
|
|
3
|
+
This module re-exports functions for backward compatibility.
|
|
4
|
+
New code should import directly from:
|
|
5
|
+
- note_mcp.utils.markdown_to_html
|
|
6
|
+
- note_mcp.utils.html_to_markdown
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from note_mcp.utils.html_to_markdown import html_to_markdown
|
|
10
|
+
from note_mcp.utils.markdown_to_html import markdown_to_html
|
|
11
|
+
|
|
12
|
+
__all__ = ["html_to_markdown", "markdown_to_html"]
|