note-connector 0.2.5 → 0.2.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. package/dist/paths.js +4 -0
  2. package/dist/setup-dependencies.js +56 -13
  3. package/package.json +3 -2
  4. package/py/pyproject.toml +86 -0
  5. package/py/src/note_mcp/__init__.py +7 -0
  6. package/py/src/note_mcp/__main__.py +65 -0
  7. package/py/src/note_mcp/api/__init__.py +31 -0
  8. package/py/src/note_mcp/api/articles.py +1395 -0
  9. package/py/src/note_mcp/api/client.py +318 -0
  10. package/py/src/note_mcp/api/embeds.py +482 -0
  11. package/py/src/note_mcp/api/images.py +456 -0
  12. package/py/src/note_mcp/api/preview.py +142 -0
  13. package/py/src/note_mcp/api/public_notes.py +150 -0
  14. package/py/src/note_mcp/auth/__init__.py +9 -0
  15. package/py/src/note_mcp/auth/browser.py +574 -0
  16. package/py/src/note_mcp/auth/file_session.py +145 -0
  17. package/py/src/note_mcp/auth/session.py +240 -0
  18. package/py/src/note_mcp/browser/__init__.py +10 -0
  19. package/py/src/note_mcp/browser/config.py +21 -0
  20. package/py/src/note_mcp/browser/manager.py +182 -0
  21. package/py/src/note_mcp/browser/preview.py +68 -0
  22. package/py/src/note_mcp/browser/url_helpers.py +18 -0
  23. package/py/src/note_mcp/chatgpt/__init__.py +1 -0
  24. package/py/src/note_mcp/chatgpt/__main__.py +63 -0
  25. package/py/src/note_mcp/chatgpt/access_log.py +25 -0
  26. package/py/src/note_mcp/chatgpt/auth.py +52 -0
  27. package/py/src/note_mcp/chatgpt/images.py +92 -0
  28. package/py/src/note_mcp/chatgpt/login_once.py +26 -0
  29. package/py/src/note_mcp/chatgpt/middleware.py +31 -0
  30. package/py/src/note_mcp/chatgpt/tools.py +255 -0
  31. package/py/src/note_mcp/chatgpt/widgets.py +121 -0
  32. package/py/src/note_mcp/decorators.py +113 -0
  33. package/py/src/note_mcp/investigator/__init__.py +33 -0
  34. package/py/src/note_mcp/investigator/__main__.py +11 -0
  35. package/py/src/note_mcp/investigator/cli.py +313 -0
  36. package/py/src/note_mcp/investigator/core.py +653 -0
  37. package/py/src/note_mcp/investigator/mcp_tools.py +225 -0
  38. package/py/src/note_mcp/models.py +557 -0
  39. package/py/src/note_mcp/py.typed +0 -0
  40. package/py/src/note_mcp/server.py +905 -0
  41. package/py/src/note_mcp/utils/__init__.py +7 -0
  42. package/py/src/note_mcp/utils/file_parser.py +314 -0
  43. package/py/src/note_mcp/utils/html_to_markdown.py +477 -0
  44. package/py/src/note_mcp/utils/logging.py +119 -0
  45. package/py/src/note_mcp/utils/markdown.py +12 -0
  46. package/py/src/note_mcp/utils/markdown_to_html.py +826 -0
@@ -0,0 +1,826 @@
1
+ """Markdown to HTML conversion utility.
2
+
3
+ Uses markdown-it-py for CommonMark-compliant conversion.
4
+ """
5
+
6
+ import re
7
+ import uuid
8
+ from collections.abc import Callable, Iterator
9
+ from contextlib import contextmanager
10
+
11
+ from markdown_it import MarkdownIt
12
+
13
+ from note_mcp.api.embeds import (
14
+ generate_embed_html,
15
+ get_embed_service,
16
+ )
17
+
18
# Pre-compiled regex patterns for performance (compiled once at import time).

# TOC pattern: [TOC] must be alone on a line (MULTILINE makes ^/$ per-line anchors).
_TOC_PATTERN = re.compile(r"^\[TOC\]$", re.MULTILINE)
# TOC placeholder (text marker, not HTML comment)
# Must match TOC_PLACEHOLDER in toc_helpers.py
_TOC_PLACEHOLDER = "§§TOC§§"
# Pattern to match TOC placeholder wrapped in paragraph tags (after Markdown conversion).
# group(1) = value of the name attribute, group(2) = value of the id attribute.
_TOC_PLACEHOLDER_HTML_PATTERN = re.compile(
    r'<p\s+name="([^"]+)"\s+id="([^"]+)">' + re.escape(_TOC_PLACEHOLDER) + r"</p>",
    re.IGNORECASE,
)

# Opening tags that receive name/id attributes.
# group(1) = tag name, group(2) = existing attributes including leading whitespace (optional).
# The tag name must be followed by whitespace or ">", so "<pre>" is not matched by the "p" branch.
# Note: <li> and <blockquote> are excluded because note.com doesn't add name/id to these tags
_TAG_PATTERN = re.compile(
    r"<(p|h[1-6]|ul|ol|code|hr|div|span)(\s[^>]*)?>",
    re.IGNORECASE,
)
# Whole <pre>...</pre> element: group(1) = attributes, group(2) = inner HTML (DOTALL spans newlines).
_PRE_PATTERN = re.compile(r"<pre([^>]*)>(.*?)</pre>", re.DOTALL | re.IGNORECASE)
# A paragraph containing only an <img>: group(1) = src, group(2) = alt, group(3) = optional title.
# Attribute order is fixed (src, alt, then title).
_IMG_IN_P_PATTERN = re.compile(
    r'<p>\s*<img\s+src="([^"]+)"\s+alt="([^"]*)"(?:\s+title="([^"]*)")?\s*/?\s*>\s*</p>',
    re.IGNORECASE,
)
# Opening <code ...> tag that carries a class="language-..." attribute (case-sensitive).
_LANGUAGE_CLASS_PATTERN = re.compile(r'<code[^>]*class="language-[^"]*"[^>]*>')
# Pattern to match li elements with direct text content (not already wrapped in p)
# This matches <li ...>content</li> where content doesn't start with <p
# group(1) = opening <li>, group(2) = content, group(3) = closing </li>.
_LI_CONTENT_PATTERN = re.compile(
    r"(<li[^>]*>)(?!<p)([^<]+|(?:(?!</li>).)*?)(</li>)",
    re.IGNORECASE | re.DOTALL,
)
# Pattern to match blockquote elements and their content
# group(1) = opening tag, group(2) = inner HTML, group(3) = closing tag.
_BLOCKQUOTE_PATTERN = re.compile(
    r"(<blockquote[^>]*>)(.*?)(</blockquote>)",
    re.DOTALL | re.IGNORECASE,
)
# Pattern to detect citation line: em-dash followed by space at start of line or after <br>
# Matches: "— Text" or "<br>— Text" at the end of content
# No MULTILINE flag: "^" and "$" anchor only at the start/end of the whole string.
# group(1) = the citation including the leading em-dash.
_CITATION_PATTERN = re.compile(
    r"(?:^|<br>)(—\s+.+?)(?:</p>|$)",
    re.IGNORECASE,
)
# Pattern to extract URL from citation: "Text (URL)"
# group(1) = text, group(2) = URL (any run of non-whitespace inside the parentheses).
_CITATION_URL_PATTERN = re.compile(r"^(.+?)\s+\((\S+)\)\s*$")

# Text alignment patterns (Issue #40)
# Center: ->text<- (must be at start of line, ends with <- at end of line)
# Right: ->text (must be at start of line, no closing marker)
# Left: <-text (must be at start of line)
# Order matters: center must be checked before right to avoid partial matches
_TEXT_ALIGN_CENTER_PATTERN = re.compile(r"^->(.+)<-$", re.MULTILINE)
_TEXT_ALIGN_RIGHT_PATTERN = re.compile(r"^->(.+)$", re.MULTILINE)
_TEXT_ALIGN_LEFT_PATTERN = re.compile(r"^<-(.+)$", re.MULTILINE)

# Pattern to find URLs that are alone on a line (potential embed URLs)
# group(1) = the URL.
_STANDALONE_URL_PATTERN = re.compile(r"^(https?://\S+)$", re.MULTILINE)

# Stock notation patterns (Issue #216)
# Japanese stocks: ^5243 (4-5 digit security code) - must be alone on a line
_STOCK_JP_PATTERN = re.compile(r"^\^(\d{4,5})$", re.MULTILINE)
# US stocks: $GOOG (uppercase ticker) - must be alone on a line
_STOCK_US_PATTERN = re.compile(r"^\$([A-Z]+)$", re.MULTILINE)

# Pattern to match paragraphs containing alignment placeholders (for html_transformer)
# group(1) = <p> attributes, group(2) = CENTER/RIGHT/LEFT, group(3) = paragraph content.
_ALIGN_P_PATTERN = re.compile(
    r"<p([^>]*)>§§ALIGN_(CENTER|RIGHT|LEFT)§§(.*?)§§/ALIGN§§</p>",
    re.DOTALL | re.IGNORECASE,
)

# Pattern to match <p> content inside blockquotes (for html_transformer)
# Groups: (1) text up to the first <p>, (2) opening <p>, (3) paragraph content,
# (4) closing </p>, (5) the remainder through the next </blockquote>.
_P_IN_BLOCKQUOTE_PATTERN = re.compile(
    r"(<blockquote[^>]*>.*?)(<p[^>]*>)(.*?)(</p>)(.*?</blockquote>)",
    re.DOTALL | re.IGNORECASE,
)

# Pattern to match blockquote elements for figure wrapping (for html_transformer)
# group(1) = inner HTML of the blockquote; attributes on the opening tag are not captured.
_BLOCKQUOTE_FIGURE_PATTERN = re.compile(
    r"<blockquote[^>]*>(.*?)</blockquote>",
    re.DOTALL | re.IGNORECASE,
)
96
+
97
+
98
+ @contextmanager
99
+ def _protect_code_blocks(content: str, prefix: str = "CODE_BLOCK") -> Iterator[tuple[str, list[tuple[str, str]]]]:
100
+ """Context manager for protecting code blocks during content processing.
101
+
102
+ Temporarily replaces fenced (```) and inline (`) code blocks with placeholders.
103
+ This prevents code block content from being processed by other transformations.
104
+
105
+ Args:
106
+ content: Content to protect code blocks in.
107
+ prefix: Prefix for placeholder names. Use unique prefixes to avoid conflicts.
108
+
109
+ Yields:
110
+ Tuple of (protected_content, code_blocks_list) where:
111
+ - protected_content: Content with code blocks replaced by placeholders
112
+ - code_blocks_list: List of (placeholder, original_code) tuples for restoration
113
+
114
+ Example:
115
+ with _protect_code_blocks(content, "ALIGN") as (protected, blocks):
116
+ # Process the protected content (reassign to protected)
117
+ protected = some_pattern.sub(replacement, protected)
118
+ return _restore_code_blocks(protected, blocks)
119
+ """
120
+ code_blocks: list[tuple[str, str]] = []
121
+
122
+ def protect(match: re.Match[str]) -> str:
123
+ placeholder = f"__{prefix}_{len(code_blocks)}__"
124
+ code_blocks.append((placeholder, match.group(0)))
125
+ return placeholder
126
+
127
+ # Protect fenced code blocks first (```)
128
+ protected = re.sub(r"```[\s\S]*?```", protect, content)
129
+ # Then protect inline code (`)
130
+ protected = re.sub(r"`[^`]+`", protect, protected)
131
+
132
+ yield protected, code_blocks
133
+
134
+
135
def html_transformer(
    pattern: re.Pattern[str],
    transformer: Callable[[re.Match[str]], str],
) -> Callable[[str], str]:
    """Bind a compiled pattern and a match callback into a ``str -> str`` function.

    The returned callable is equivalent to ``pattern.sub(transformer, html)``;
    this factory removes that boilerplate from each pattern-based HTML rewrite.

    Args:
        pattern: Compiled regex to search for.
        transformer: Callback that receives each match and returns its replacement.

    Returns:
        A function that takes HTML text and returns it with every match rewritten.

    Example:
        >>> pattern = re.compile(r"<em>(.*?)</em>")
        >>> strong_transformer = html_transformer(pattern, lambda m: f"<strong>{m.group(1)}</strong>")
        >>> strong_transformer("<em>text</em>")
        '<strong>text</strong>'
    """

    def apply(html: str) -> str:
        rewritten = pattern.sub(transformer, html)
        return rewritten

    return apply
163
+
164
+
165
+ def _restore_code_blocks(content: str, blocks: list[tuple[str, str]]) -> str:
166
+ """Restore code blocks from placeholders.
167
+
168
+ Args:
169
+ content: Content with placeholders.
170
+ blocks: List of (placeholder, original) tuples from _protect_code_blocks.
171
+
172
+ Returns:
173
+ Content with placeholders replaced by original code blocks.
174
+ """
175
+ for placeholder, original in blocks:
176
+ content = content.replace(placeholder, original)
177
+ return content
178
+
179
+
180
def protected_content_transformer(
    prefix: str,
) -> Callable[[Callable[[str], str]], Callable[[str], str]]:
    """Build a decorator that shields code blocks from a Markdown transformation.

    A function decorated with the result is called on text whose fenced (```)
    and inline (`) code has been swapped for placeholders; once it returns,
    the original code is put back into its output.

    Args:
        prefix: Placeholder prefix (e.g., "STOCK", "ALIGN", "TOC"). Use a
            distinct prefix per transformation to avoid placeholder clashes.

    Returns:
        A decorator for ``str -> str`` transformation functions.

    Example:
        >>> @protected_content_transformer("STOCK")
        ... def convert_stock(content: str) -> str:
        ...     return pattern.sub(replacement, content)
        >>> convert_stock("text with `^5243` in code")  # code block preserved
    """
    from functools import wraps

    def decorator(transform: Callable[[str], str]) -> Callable[[str], str]:
        @wraps(transform)
        def wrapper(content: str) -> str:
            with _protect_code_blocks(content, prefix) as guarded:
                masked, saved = guarded
                transformed = transform(masked)
            # Put the original code back only after the transformation ran.
            return _restore_code_blocks(transformed, saved)

        return wrapper

    return decorator
218
+
219
+
220
def has_embed_url(content: str) -> bool:
    """Report whether any URL standing alone on a line is embeddable.

    Every URL that occupies a whole line is passed to get_embed_service()
    (Issue #235: single source of truth for embed detection); the first one
    it recognises makes the result True.

    Args:
        content: Markdown content to check.

    Returns:
        True if at least one standalone URL maps to an embed service,
        otherwise False.
    """
    standalone_urls = (found.group(1) for found in _STANDALONE_URL_PATTERN.finditer(content))
    # any() stops at the first recognised URL, like an early return would.
    return any(get_embed_service(candidate) is not None for candidate in standalone_urls)
242
+
243
+
244
# Pattern to match standalone embed URLs in HTML paragraphs
# Matches: <p name="..." id="...">https://youtube.com/watch?v=xxx</p>
# Only paragraphs that already carry name and id attributes (in that order) match.
# Uses negative lookbehind to exclude paragraphs that directly follow a bare "<li>" tag
# group(1) = leading whitespace, group(2) = the URL, group(3) = trailing whitespace.
_STANDALONE_EMBED_URL_IN_HTML_PATTERN = re.compile(
    r'(?<!<li>)<p\s+name="[^"]+"\s+id="[^"]+">(\s*)(https?://\S+?)(\s*)</p>',
    re.IGNORECASE,
)
251
+
252
+
253
def _convert_standalone_embed_urls(html: str) -> str:
    """Swap paragraphs that hold only an embeddable URL for embed markup.

    Each paragraph matched by _STANDALONE_EMBED_URL_IN_HTML_PATTERN is checked
    with get_embed_service(). Recognised URLs are replaced by the output of
    generate_embed_html(); everything else is left exactly as it was.

    Intended to run after Markdown rendering and name/id assignment, and
    before code block processing.

    Args:
        html: HTML whose paragraphs may contain standalone URLs.

    Returns:
        HTML with recognised standalone URLs replaced by embed markup.
    """

    def to_embed(found: re.Match[str]) -> str:
        candidate = found.group(2).strip()
        embed_service = get_embed_service(candidate)
        # Unknown service: hand the paragraph back untouched.
        return found.group(0) if embed_service is None else generate_embed_html(candidate, embed_service)

    return _STANDALONE_EMBED_URL_IN_HTML_PATTERN.sub(to_embed, html)
283
+
284
+
285
+ def _generate_uuid() -> str:
286
+ """Generate a UUID for note.com element IDs."""
287
+ return str(uuid.uuid4())
288
+
289
+
290
def _extract_citation(blockquote_content: str) -> tuple[str, str]:
    """Extract citation from blockquote content.

    Detects citation lines starting with em-dash (—) followed by space.
    Supports optional URL in parentheses: "— Source (https://example.com)"

    Only the matched citation is removed, located by its match offsets.
    Whatever terminated it ("</p>" or end of string) is kept as-is, so no
    stray "</p>" is introduced and identical text elsewhere in the
    blockquote is left untouched.

    Args:
        blockquote_content: HTML content inside <blockquote> tags

    Returns:
        Tuple of (modified_content, figcaption_html):
            - modified_content: blockquote content with citation line removed
            - figcaption_html: HTML for figcaption element content (may be empty)

    Examples:
        >>> _extract_citation("<p>Quote<br>— Source</p>")
        ('<p>Quote</p>', 'Source')
        >>> _extract_citation("<p>Quote<br>— Source (https://example.com)</p>")
        ('<p>Quote</p>', '<a href="https://example.com">Source</a>')
    """
    match = _CITATION_PATTERN.search(blockquote_content)
    if match is None:
        return blockquote_content, ""

    # group(1) is "— Text" or "— Text (URL)"; drop the dash, then surrounding whitespace.
    citation_text = match.group(1)[1:].strip()
    if not citation_text:
        return blockquote_content, ""

    # Text between the end of the citation and the end of the match is the
    # terminator the pattern consumed: "</p>" or the empty string.
    terminator = blockquote_content[match.end(1) : match.end()]
    modified_content = blockquote_content[: match.start()] + terminator + blockquote_content[match.end() :]

    # Check for URL pattern: "Text (URL)"
    url_match = _CITATION_URL_PATTERN.match(citation_text)
    if url_match:
        text = url_match.group(1).strip()
        url = url_match.group(2)
        figcaption_html = f'<a href="{url}">{text}</a>'
    else:
        figcaption_html = citation_text

    return modified_content, figcaption_html
338
+
339
+
340
@protected_content_transformer("STOCK")
def _convert_stock_notation(content: str) -> str:
    """Rewrite stock shorthand lines as noteマネー URLs (Issue #216).

    Runs BEFORE Markdown rendering:
        - ^5243 (Japanese stock) → https://money.note.com/companies/5243
        - $GOOG (US stock) → https://money.note.com/us-companies/GOOG

    Only shorthand that is alone on its line is rewritten. The decorator
    keeps code blocks out of reach of this rewrite.

    Args:
        content: Markdown content that may contain stock shorthand.

    Returns:
        Content with stock shorthand replaced by URLs.
    """
    rewrites = (
        # Japanese stocks first, then US stocks.
        (_STOCK_JP_PATTERN, r"https://money.note.com/companies/\1"),
        (_STOCK_US_PATTERN, r"https://money.note.com/us-companies/\1"),
    )
    for notation, url_template in rewrites:
        content = notation.sub(url_template, content)
    return content
364
+
365
+
366
@protected_content_transformer("ALIGN")
def _convert_text_alignment(content: str) -> str:
    """Turn alignment shorthand into internal placeholders.

    Runs BEFORE Markdown rendering; the placeholders are turned into styled
    paragraphs afterwards. The decorator keeps code blocks out of reach.

    Notation:
        ->text<- : center alignment
        ->text : right alignment
        <-text : left alignment

    Args:
        content: Markdown content with alignment markers.

    Returns:
        Content with alignment markers replaced by placeholders.
    """
    # Center must come first: it is the more specific form of the "->" prefix
    # that the right-alignment pattern also matches.
    marker_rules = (
        (_TEXT_ALIGN_CENTER_PATTERN, r"§§ALIGN_CENTER§§\1§§/ALIGN§§"),
        (_TEXT_ALIGN_RIGHT_PATTERN, r"§§ALIGN_RIGHT§§\1§§/ALIGN§§"),
        (_TEXT_ALIGN_LEFT_PATTERN, r"§§ALIGN_LEFT§§\1§§/ALIGN§§"),
    )
    for marker, placeholder in marker_rules:
        content = marker.sub(placeholder, content)
    return content
393
+
394
+
395
+ def _apply_alignment(match: re.Match[str]) -> str:
396
+ """Transform alignment placeholder to styled paragraph.
397
+
398
+ Args:
399
+ match: Regex match with groups:
400
+ - group(1): HTML attributes (e.g., ' name="..." id="..."')
401
+ - group(2): Alignment type (CENTER, RIGHT, or LEFT)
402
+ - group(3): Paragraph content
403
+
404
+ Returns:
405
+ HTML paragraph with text-align style applied.
406
+ """
407
+ attrs = match.group(1)
408
+ alignment = match.group(2).lower()
409
+ content = match.group(3)
410
+
411
+ # Add style attribute for text-align
412
+ style = f"text-align: {alignment}"
413
+
414
+ # If there are existing attributes, append style
415
+ if attrs and 'style="' in attrs:
416
+ # Append to existing style (unlikely but handle it)
417
+ attrs = attrs.replace('style="', f'style="{style}; ')
418
+ else:
419
+ # Add new style attribute before other attrs
420
+ attrs = f' style="{style}"' + (attrs or "")
421
+
422
+ return f"<p{attrs}>{content}</p>"
423
+
424
+
425
# _apply_text_alignment_to_html(html: str) -> str
#
# Rewrites every paragraph matched by _ALIGN_P_PATTERN (the placeholders
# produced by _convert_text_alignment) into a paragraph with a text-align
# style, using _apply_alignment for each match.
#
# Args:
#     html: HTML content with alignment placeholders
#
# Returns:
#     HTML with text-align styles applied
_apply_text_alignment_to_html = html_transformer(
    pattern=_ALIGN_P_PATTERN,
    transformer=_apply_alignment,
)
438
+
439
+
440
def _wrap_li_content_in_p(html: str) -> str:
    """Put the content of each list item inside a paragraph.

    ProseMirror (used by note.com) expects list items to hold block content
    such as paragraphs rather than bare inline text.

    Converts: <li>Item text</li>
    To: <li><p>Item text</p></li>

    Args:
        html: HTML string with list items.

    Returns:
        HTML in which matched list item content is wrapped in <p> tags.
    """

    def wrap_content(found: re.Match[str]) -> str:
        opening, inner, closing = found.group(1, 2, 3)
        trimmed = inner.strip() if inner else ""
        if not trimmed:
            # Empty or whitespace-only item: leave it exactly as it was.
            return found.group(0)
        return f"{opening}<p>{trimmed}</p>{closing}"

    return _LI_CONTENT_PATTERN.sub(wrap_content, html)
468
+
469
+
470
+ def _convert_p_newlines(match: re.Match[str]) -> str:
471
+ """Transform paragraph newlines to <br> tags inside blockquotes.
472
+
473
+ Args:
474
+ match: Regex match with groups:
475
+ - group(1): Content before <p> tag (including <blockquote>)
476
+ - group(2): Opening <p> tag (e.g., '<p name="..." id="...">')
477
+ - group(3): Paragraph content (text inside <p>)
478
+ - group(4): Closing </p> tag
479
+ - group(5): Content after </p> tag (including </blockquote>)
480
+
481
+ Returns:
482
+ Blockquote HTML with newlines converted to <br> tags.
483
+ """
484
+ before_p = match.group(1) # <blockquote...> and anything before <p>
485
+ p_open = match.group(2) # <p ...>
486
+ p_content = match.group(3) # content inside <p>
487
+ p_close = match.group(4) # </p>
488
+ after_p = match.group(5) # anything after </p> including </blockquote>
489
+
490
+ # Convert newlines to <br> tags (note.com uses <br> without slash)
491
+ p_content = p_content.replace("\n", "<br>")
492
+
493
+ return f"{before_p}{p_open}{p_content}{p_close}{after_p}"
494
+
495
+
496
+ _convert_blockquote_newlines_to_br = html_transformer(_P_IN_BLOCKQUOTE_PATTERN, _convert_p_newlines)
497
+ """Convert newlines inside blockquote paragraphs to <br> tags.
498
+
499
+ note.com's browser editor uses <br> tags for line breaks inside blockquotes.
500
+ This function converts newlines to <br> tags to match that format.
501
+
502
+ Converts:
503
+ <blockquote><p>Line 1
504
+ Line 2</p></blockquote>
505
+ To:
506
+ <blockquote><p>Line 1<br>Line 2</p></blockquote>
507
+
508
+ Note: While this generates correct HTML with <br> tags, note.com's API
509
+ sanitizes <br> tags from blockquote content. This is a server-side
510
+ limitation. Content created via browser editor preserves <br> tags,
511
+ but API-submitted content has them stripped.
512
+
513
+ Workaround for users: Use separate blockquotes for each line:
514
+ > Line 1
515
+
516
+ > Line 2
517
+
518
+ Args:
519
+ html: HTML string with blockquotes
520
+
521
+ Returns:
522
+ HTML with blockquote paragraph newlines converted to <br> tags
523
+ """
524
+
525
+
526
def _wrap_in_figure(match: re.Match[str]) -> str:
    """Render one blockquote match in note.com's figure layout.

    Args:
        match: Regex match with groups:
            - group(1): inner HTML of the blockquote

    Returns:
        A <figure> with fresh name/id attributes containing the blockquote
        (citation removed) followed by a <figcaption> holding the citation,
        which is empty when _extract_citation found none.
    """
    body, caption = _extract_citation(match.group(1))
    figure_id = _generate_uuid()
    pieces = (
        f'<figure name="{figure_id}" id="{figure_id}">',
        f"<blockquote>{body}</blockquote>",
        f"<figcaption>{caption}</figcaption></figure>",
    )
    return "".join(pieces)
547
+
548
+
549
# _convert_blockquotes_to_note_format(html: str) -> str
#
# Wraps every blockquote matched by _BLOCKQUOTE_FIGURE_PATTERN in a <figure>,
# using _wrap_in_figure for each match. Target layout:
#     <figure name="UUID" id="UUID">
#     <blockquote><p name="UUID" id="UUID">content</p></blockquote>
#     <figcaption>citation</figcaption>
#     </figure>
#
# The citation comes from a line starting with an em-dash (—):
#     - "— Source" becomes <figcaption>Source</figcaption>
#     - "— Source (URL)" becomes <figcaption><a href="URL">Source</a></figcaption>
#
# This format is required for the API to preserve <br> tags inside blockquotes.
#
# Args:
#     html: HTML string with blockquotes
#
# Returns:
#     HTML with blockquotes wrapped in figure elements
_convert_blockquotes_to_note_format = html_transformer(
    pattern=_BLOCKQUOTE_FIGURE_PATTERN,
    transformer=_wrap_in_figure,
)
570
+
571
+
572
def _add_uuid_to_elements(html: str) -> str:
    """Give every tag matched by _TAG_PATTERN a generated name and id.

    note.com requires elements to carry unique name attributes; the same
    UUID is written to both ``name`` and ``id``.
    Note: <pre> tags are handled separately by _convert_code_blocks_to_note_format.
    Note: <li> tags are excluded because note.com doesn't add name to <li> tags.

    Args:
        html: HTML string.

    Returns:
        HTML in which matched tags that had no name attribute now have
        name and id attributes.
    """

    def tag_with_ids(found: re.Match[str]) -> str:
        existing = found.group(2) or ""
        if 'name="' in existing:
            # Already named: keep the tag exactly as written.
            return found.group(0)

        fresh = _generate_uuid()
        # The new attributes go first; whatever was there follows unchanged.
        return "<" + found.group(1) + f' name="{fresh}" id="{fresh}"' + existing + ">"

    return _TAG_PATTERN.sub(tag_with_ids, html)
599
+
600
+
601
def _convert_images_to_note_format(html: str) -> str:
    """Rewrite image-only paragraphs as note.com figures.

    Target layout:
        <figure name="UUID" id="UUID">
        <img src="URL" alt="" width="620" height="457"
        contenteditable="false" draggable="false">
        <figcaption></figcaption>
        </figure>

    The image title, when present, becomes the figcaption text.

    Args:
        html: HTML string with standard img tags.

    Returns:
        HTML with paragraphs matched by _IMG_IN_P_PATTERN replaced by figures.
    """

    def to_figure(found: re.Match[str]) -> str:
        src, alt, title = found.group(1, 2, 3)
        caption = title or ""  # empty string when there is no title
        figure_id = _generate_uuid()
        # The same UUID is written to both name and id.
        return "".join(
            (
                f'<figure name="{figure_id}" id="{figure_id}">',
                f'<img src="{src}" alt="{alt}" width="620" height="457" ',
                'contenteditable="false" draggable="false">',
                f"<figcaption>{caption}</figcaption></figure>",
            )
        )

    return _IMG_IN_P_PATTERN.sub(to_figure, html)
632
+
633
+
634
def _convert_code_blocks_to_note_format(html: str) -> str:
    """Convert code blocks to note.com format and handle newlines.

    note.com requires:
        - <pre class="codeBlock"> with name and id attributes
        - <code> without language class
        - Actual newlines preserved inside code blocks
        - Newlines removed from other HTML elements

    Uses placeholder approach to preserve newlines in code blocks
    while removing them from the rest of the HTML.

    Placeholders embed a random per-call token and are restored in a single
    regex pass. Text that merely looks like a placeholder (for example
    "__PRE_BLOCK_0__" written inside a code sample) is therefore never
    mistaken for one, and restored code is not rescanned.

    Args:
        html: HTML string with code blocks

    Returns:
        HTML with code blocks in note.com format
    """
    pre_blocks: list[str] = []
    # Random token so document text cannot collide with our placeholders.
    token = uuid.uuid4().hex

    def convert_pre_block(match: re.Match[str]) -> str:
        """Convert pre block to note.com format and store for later restoration."""
        # Remove language class from <code> tag
        # markdown-it-py adds class="language-xxx" which note.com doesn't use
        content = _LANGUAGE_CLASS_PATTERN.sub("<code>", match.group(2))

        # Generate fresh UUIDs for code blocks
        # note.com requires both 'name' and 'id' attributes for proper content handling
        element_id = _generate_uuid()
        pre_blocks.append(f'<pre name="{element_id}" id="{element_id}" class="codeBlock">{content}</pre>')

        return f"__PRE_BLOCK_{token}_{len(pre_blocks) - 1}__"

    # Replace pre blocks with placeholders
    result = _PRE_PATTERN.sub(convert_pre_block, html)

    # Remove newlines from the rest of the HTML
    result = result.replace("\n", "")

    if not pre_blocks:
        return result

    # Restore pre blocks with their preserved newlines. A function replacement
    # is inserted literally (no backslash processing) and is not rescanned.
    restore_pattern = re.compile(rf"__PRE_BLOCK_{token}_(\d+)__")
    return restore_pattern.sub(lambda found: pre_blocks[int(found.group(1))], result)
683
+
684
+
685
def _has_toc_placeholder(content: str) -> bool:
    """Tell whether a [TOC] marker stands alone on some line of the content.

    Args:
        content: Markdown content to check.

    Returns:
        True if _TOC_PATTERN finds a match, otherwise False.
    """
    found = _TOC_PATTERN.search(content)
    return found is not None
695
+
696
+
697
def _replace_toc_markers(content: str) -> str:
    """Keep one TOC: the first [TOC] becomes the placeholder, the rest vanish.

    Args:
        content: Content with potential [TOC] markers.

    Returns:
        Content in which the first [TOC] line holds the placeholder text and
        every later [TOC] marker has been replaced by an empty string.
    """
    # Pass 1: only the first marker is swapped for the placeholder.
    with_placeholder = _TOC_PATTERN.sub(_TOC_PLACEHOLDER, content, count=1)
    # Pass 2: the placeholder no longer matches the marker pattern, so only
    # the remaining [TOC] markers are blanked out.
    return _TOC_PATTERN.sub("", with_placeholder)
719
+
720
+
721
@protected_content_transformer("TOC")
def _convert_toc_to_placeholder(content: str) -> str:
    """Swap the first [TOC] marker for the internal placeholder.

    Later [TOC] markers are removed. Because of the decorator, markers that
    appear inside code blocks are not seen by this function.

    Args:
        content: Markdown content with potential [TOC] markers.

    Returns:
        Content with [TOC] handled as described above.
    """
    replaced = _replace_toc_markers(content)
    return replaced
735
+
736
+
737
def _convert_toc_placeholder_to_html(html: str) -> str:
    """Turn the placeholder paragraph into a <table-of-contents> element.

    Replaces <p name="..." id="...">§§TOC§§</p> with
    <table-of-contents name="..." id="..."></table-of-contents>,
    carrying the paragraph's name and id over to the new element.

    Must run after Markdown rendering and name/id assignment, because the
    pattern only matches paragraphs that already have both attributes.

    Issue #117: This enables TOC via API without browser automation.

    Args:
        html: HTML content with potential TOC placeholder.

    Returns:
        HTML with the placeholder paragraph replaced by <table-of-contents>.
    """

    def as_toc_element(found: re.Match[str]) -> str:
        name, element_id = found.group(1, 2)
        return f'<table-of-contents name="{name}" id="{element_id}"></table-of-contents>'

    return _TOC_PLACEHOLDER_HTML_PATTERN.sub(as_toc_element, html)
761
+
762
+
763
def markdown_to_html(content: str) -> str:
    """Convert Markdown content to note.com-flavoured HTML.

    Markdown is rendered with markdown-it-py (strikethrough enabled) and the
    result is post-processed step by step. Because elements receive freshly
    generated name/id attributes, two calls with the same input do not
    produce byte-identical output.

    Args:
        content: Markdown formatted text.

    Returns:
        HTML formatted text. Returns empty string for empty or
        whitespace-only input.
    """
    if not (content and content.strip()):
        return ""

    # Steps that work on the Markdown source, in this exact order.
    markdown_steps = (
        # 1. Convert [TOC] to placeholder FIRST (before any processing)
        _convert_toc_to_placeholder,
        # 2. Stock notation → URLs (Issue #216); must precede rendering so
        #    the URLs can be picked up as embeds later
        _convert_stock_notation,
        # 3. Alignment markers → placeholders
        _convert_text_alignment,
    )
    for prepare in markdown_steps:
        content = prepare(content)

    # 4. Markdown conversion
    result: str = MarkdownIt().enable("strikethrough").render(content)

    # Steps that work on the rendered HTML, in this exact order.
    html_steps = (
        # Images → note.com figure format
        _convert_images_to_note_format,
        # List item content → wrapped in <p> (ProseMirror requirement)
        _wrap_li_content_in_p,
        # Blockquote newlines → <br> (note.com browser editor format)
        _convert_blockquote_newlines_to_br,
        # name/id on elements (note.com requirement)
        _add_uuid_to_elements,
        # TOC placeholder → <table-of-contents> (Issue #117); needs name/id
        _convert_toc_placeholder_to_html,
        # Alignment placeholders → styled paragraphs; needs name/id
        _apply_text_alignment_to_html,
        # Blockquotes → figure format (required for the API to keep <br>)
        _convert_blockquotes_to_note_format,
        # Standalone embed URLs → figure elements (Issue #116)
        _convert_standalone_embed_urls,
        # Code blocks → note.com format; also strips newlines elsewhere
        _convert_code_blocks_to_note_format,
    )
    for step in html_steps:
        result = step(result)

    return result