notionary 0.2.28__py3-none-any.whl → 0.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- notionary/__init__.py +9 -2
- notionary/blocks/__init__.py +5 -0
- notionary/blocks/client.py +6 -4
- notionary/blocks/enums.py +28 -1
- notionary/blocks/rich_text/markdown_rich_text_converter.py +14 -0
- notionary/blocks/rich_text/models.py +14 -0
- notionary/blocks/rich_text/name_id_resolver/__init__.py +2 -0
- notionary/blocks/rich_text/name_id_resolver/data_source.py +32 -0
- notionary/blocks/rich_text/rich_text_markdown_converter.py +12 -0
- notionary/blocks/rich_text/rich_text_patterns.py +3 -0
- notionary/blocks/schemas.py +42 -10
- notionary/comments/__init__.py +5 -0
- notionary/comments/client.py +7 -10
- notionary/comments/factory.py +4 -6
- notionary/data_source/http/data_source_instance_client.py +14 -4
- notionary/data_source/properties/{models.py → schemas.py} +4 -8
- notionary/data_source/query/__init__.py +9 -0
- notionary/data_source/query/builder.py +38 -10
- notionary/data_source/query/schema.py +13 -10
- notionary/data_source/query/validator.py +11 -11
- notionary/data_source/schema/registry.py +104 -0
- notionary/data_source/schema/service.py +136 -0
- notionary/data_source/schemas.py +1 -1
- notionary/data_source/service.py +29 -103
- notionary/database/service.py +17 -60
- notionary/exceptions/__init__.py +5 -1
- notionary/exceptions/block_parsing.py +21 -0
- notionary/exceptions/search.py +24 -0
- notionary/http/client.py +9 -10
- notionary/http/models.py +5 -4
- notionary/page/content/factory.py +10 -3
- notionary/page/content/markdown/builder.py +76 -154
- notionary/page/content/markdown/nodes/__init__.py +0 -2
- notionary/page/content/markdown/nodes/audio.py +1 -1
- notionary/page/content/markdown/nodes/base.py +1 -1
- notionary/page/content/markdown/nodes/bookmark.py +1 -1
- notionary/page/content/markdown/nodes/breadcrumb.py +1 -1
- notionary/page/content/markdown/nodes/bulleted_list.py +31 -8
- notionary/page/content/markdown/nodes/callout.py +12 -10
- notionary/page/content/markdown/nodes/code.py +3 -5
- notionary/page/content/markdown/nodes/columns.py +39 -21
- notionary/page/content/markdown/nodes/container.py +64 -0
- notionary/page/content/markdown/nodes/divider.py +1 -1
- notionary/page/content/markdown/nodes/embed.py +1 -1
- notionary/page/content/markdown/nodes/equation.py +1 -1
- notionary/page/content/markdown/nodes/file.py +1 -1
- notionary/page/content/markdown/nodes/heading.py +26 -6
- notionary/page/content/markdown/nodes/image.py +1 -1
- notionary/page/content/markdown/nodes/mixins/__init__.py +5 -0
- notionary/page/content/markdown/nodes/mixins/caption.py +1 -1
- notionary/page/content/markdown/nodes/numbered_list.py +28 -5
- notionary/page/content/markdown/nodes/paragraph.py +1 -1
- notionary/page/content/markdown/nodes/pdf.py +1 -1
- notionary/page/content/markdown/nodes/quote.py +17 -5
- notionary/page/content/markdown/nodes/space.py +1 -1
- notionary/page/content/markdown/nodes/table.py +1 -1
- notionary/page/content/markdown/nodes/table_of_contents.py +1 -1
- notionary/page/content/markdown/nodes/todo.py +23 -7
- notionary/page/content/markdown/nodes/toggle.py +13 -14
- notionary/page/content/markdown/nodes/video.py +1 -1
- notionary/page/content/parser/context.py +98 -21
- notionary/page/content/parser/factory.py +1 -10
- notionary/page/content/parser/parsers/__init__.py +0 -2
- notionary/page/content/parser/parsers/audio.py +1 -1
- notionary/page/content/parser/parsers/base.py +1 -1
- notionary/page/content/parser/parsers/bookmark.py +1 -1
- notionary/page/content/parser/parsers/breadcrumb.py +1 -1
- notionary/page/content/parser/parsers/bulleted_list.py +52 -8
- notionary/page/content/parser/parsers/callout.py +55 -84
- notionary/page/content/parser/parsers/caption.py +1 -1
- notionary/page/content/parser/parsers/code.py +5 -5
- notionary/page/content/parser/parsers/column.py +23 -64
- notionary/page/content/parser/parsers/column_list.py +45 -45
- notionary/page/content/parser/parsers/divider.py +1 -1
- notionary/page/content/parser/parsers/embed.py +1 -1
- notionary/page/content/parser/parsers/equation.py +1 -1
- notionary/page/content/parser/parsers/file.py +1 -1
- notionary/page/content/parser/parsers/heading.py +65 -8
- notionary/page/content/parser/parsers/image.py +1 -1
- notionary/page/content/parser/parsers/numbered_list.py +52 -8
- notionary/page/content/parser/parsers/paragraph.py +3 -2
- notionary/page/content/parser/parsers/pdf.py +1 -1
- notionary/page/content/parser/parsers/quote.py +75 -15
- notionary/page/content/parser/parsers/space.py +14 -8
- notionary/page/content/parser/parsers/table.py +1 -1
- notionary/page/content/parser/parsers/table_of_contents.py +1 -1
- notionary/page/content/parser/parsers/todo.py +57 -19
- notionary/page/content/parser/parsers/toggle.py +17 -74
- notionary/page/content/parser/parsers/video.py +1 -1
- notionary/page/content/parser/post_processing/handlers/rich_text_length.py +6 -4
- notionary/page/content/parser/post_processing/handlers/rich_text_length_truncation.py +43 -22
- notionary/page/content/parser/pre_processsing/handlers/__init__.py +4 -0
- notionary/page/content/parser/pre_processsing/handlers/column_syntax.py +108 -54
- notionary/page/content/parser/pre_processsing/handlers/indentation.py +86 -0
- notionary/page/content/parser/pre_processsing/handlers/video_syntax.py +66 -0
- notionary/page/content/parser/pre_processsing/handlers/whitespace.py +14 -7
- notionary/page/content/parser/service.py +9 -0
- notionary/page/content/renderer/context.py +5 -2
- notionary/page/content/renderer/factory.py +2 -11
- notionary/page/content/renderer/post_processing/handlers/__init__.py +2 -2
- notionary/page/content/renderer/post_processing/handlers/numbered_list.py +156 -0
- notionary/page/content/renderer/renderers/__init__.py +0 -2
- notionary/page/content/renderer/renderers/base.py +1 -1
- notionary/page/content/renderer/renderers/bulleted_list.py +1 -1
- notionary/page/content/renderer/renderers/callout.py +6 -21
- notionary/page/content/renderer/renderers/captioned_block.py +1 -1
- notionary/page/content/renderer/renderers/column.py +28 -19
- notionary/page/content/renderer/renderers/column_list.py +24 -11
- notionary/page/content/renderer/renderers/heading.py +53 -27
- notionary/page/content/renderer/renderers/numbered_list.py +6 -5
- notionary/page/content/renderer/renderers/quote.py +1 -1
- notionary/page/content/renderer/renderers/todo.py +1 -1
- notionary/page/content/renderer/renderers/toggle.py +6 -7
- notionary/page/content/service.py +4 -1
- notionary/page/content/syntax/__init__.py +4 -0
- notionary/page/content/syntax/grammar.py +10 -0
- notionary/page/content/syntax/models.py +0 -2
- notionary/page/content/syntax/{service.py → registry.py} +31 -91
- notionary/page/properties/client.py +3 -3
- notionary/page/properties/models.py +3 -2
- notionary/page/properties/service.py +18 -3
- notionary/page/service.py +22 -80
- notionary/shared/entity/service.py +94 -36
- notionary/shared/models/cover.py +1 -1
- notionary/shared/typings.py +3 -0
- notionary/user/base.py +60 -11
- notionary/user/factory.py +0 -0
- notionary/utils/decorators.py +122 -0
- notionary/utils/fuzzy.py +18 -6
- notionary/utils/mixins/logging.py +38 -27
- notionary/utils/pagination.py +70 -16
- notionary/workspace/__init__.py +2 -1
- notionary/workspace/client.py +4 -2
- notionary/workspace/query/__init__.py +3 -0
- notionary/workspace/query/builder.py +25 -1
- notionary/workspace/query/models.py +12 -3
- notionary/workspace/query/service.py +57 -32
- notionary/workspace/service.py +31 -21
- {notionary-0.2.28.dist-info → notionary-0.3.1.dist-info}/METADATA +35 -105
- notionary-0.3.1.dist-info/RECORD +211 -0
- notionary/page/content/markdown/nodes/toggleable_heading.py +0 -35
- notionary/page/content/parser/parsers/toggleable_heading.py +0 -150
- notionary/page/content/renderer/post_processing/handlers/numbered_list_placeholdere.py +0 -62
- notionary/page/content/renderer/renderers/toggleable_heading.py +0 -78
- notionary/utils/async_retry.py +0 -39
- notionary/utils/singleton.py +0 -13
- notionary-0.2.28.dist-info/RECORD +0 -200
- {notionary-0.2.28.dist-info → notionary-0.3.1.dist-info}/WHEEL +0 -0
- {notionary-0.2.28.dist-info → notionary-0.3.1.dist-info}/licenses/LICENSE +0 -0
|
@@ -1,9 +1,13 @@
|
|
|
1
1
|
from .column_syntax import ColumnSyntaxPreProcessor
|
|
2
|
+
from .indentation import IndentationNormalizer
|
|
2
3
|
from .port import PreProcessor
|
|
4
|
+
from .video_syntax import VideoFormatPreProcessor
|
|
3
5
|
from .whitespace import WhitespacePreProcessor
|
|
4
6
|
|
|
5
7
|
__all__ = [
|
|
6
8
|
"ColumnSyntaxPreProcessor",
|
|
9
|
+
"IndentationNormalizer",
|
|
7
10
|
"PreProcessor",
|
|
11
|
+
"VideoFormatPreProcessor",
|
|
8
12
|
"WhitespacePreProcessor",
|
|
9
13
|
]
|
|
@@ -3,78 +3,132 @@ from typing import override
|
|
|
3
3
|
|
|
4
4
|
from notionary.exceptions.block_parsing import InsufficientColumnsError, InvalidColumnRatioSumError
|
|
5
5
|
from notionary.page.content.parser.pre_processsing.handlers.port import PreProcessor
|
|
6
|
-
from notionary.page.content.syntax
|
|
6
|
+
from notionary.page.content.syntax import MarkdownGrammar, SyntaxRegistry
|
|
7
|
+
from notionary.utils.decorators import time_execution_sync
|
|
8
|
+
from notionary.utils.mixins.logging import LoggingMixin
|
|
7
9
|
|
|
8
|
-
RATIO_TOLERANCE = 0.0001
|
|
9
10
|
|
|
11
|
+
class ColumnSyntaxPreProcessor(PreProcessor, LoggingMixin):
    """Validate column-list syntax in markdown text before it is parsed.

    The pre-processor never rewrites the text: ``process`` either returns the
    input unchanged or raises when a column list is malformed.

    Checks performed for every column list found:

    * it contains at least ``_MINIMUM_COLUMNS`` columns;
    * if *every* column carries an explicit width ratio, the ratios sum to
      1.0 within ``_RATIO_TOLERANCE``.
    """

    # Maximum absolute deviation from 1.0 that a ratio sum may have.
    _RATIO_TOLERANCE = 0.0001
    # A column list with fewer columns than this is rejected.
    _MINIMUM_COLUMNS = 2

    def __init__(
        self, syntax_registry: SyntaxRegistry | None = None, markdown_grammar: MarkdownGrammar | None = None
    ) -> None:
        """Cache the delimiters, column pattern and indentation width.

        Args:
            syntax_registry: Source of the column / column-list syntax
                definitions; a default ``SyntaxRegistry()`` is created when omitted.
            markdown_grammar: Source of ``spaces_per_nesting_level``; a default
                ``MarkdownGrammar()`` is created when omitted.
        """
        super().__init__()
        self._syntax_registry = syntax_registry or SyntaxRegistry()
        self._markdown_grammar = markdown_grammar or MarkdownGrammar()

        self._spaces_per_nesting_level = self._markdown_grammar.spaces_per_nesting_level
        self._column_list_delimiter = self._syntax_registry.get_column_list_syntax().start_delimiter
        self._column_delimiter = self._syntax_registry.get_column_syntax().start_delimiter
        self._column_pattern = self._syntax_registry.get_column_syntax().regex_pattern

    @override
    @time_execution_sync()
    def process(self, markdown_text: str) -> str:
        """Validate all column lists and return ``markdown_text`` unchanged.

        Raises:
            InsufficientColumnsError: A column list has too few columns.
            InvalidColumnRatioSumError: Explicit ratios do not sum to 1.0.
        """
        if not self._contains_column_lists(markdown_text):
            return markdown_text

        self._validate_all_column_lists(markdown_text)
        return markdown_text

    def _contains_column_lists(self, markdown_text: str) -> bool:
        """Cheap substring check used to skip texts without any column list."""
        return self._column_list_delimiter in markdown_text

    def _validate_all_column_lists(self, markdown_text: str) -> None:
        """Validate every column-list block found in the text."""
        column_list_blocks = self._extract_column_list_blocks(markdown_text)

        for block in column_list_blocks:
            self._validate_column_list_block(block)

    def _extract_column_list_blocks(self, markdown_text: str) -> list[str]:
        """Return the body of each column list (the lines following its delimiter)."""
        lines = markdown_text.split("\n")
        blocks = []

        for index, line in enumerate(lines):
            if self._is_column_list_start(line):
                block_content = self._extract_indented_block(lines, index + 1)
                blocks.append(block_content)

        return blocks

    def _is_column_list_start(self, line: str) -> bool:
        """Return True when the line, ignoring surrounding whitespace, is the column-list delimiter."""
        return line.strip() == self._column_list_delimiter

    def _extract_indented_block(self, lines: list[str], start_index: int) -> str:
        """Collect the indented block that starts at ``start_index``.

        The block runs until the first non-blank line that is indented less
        than the block's base indentation. Blank lines are kept verbatim and
        never terminate the block. The base indentation is removed from every
        collected non-blank line.

        Args:
            lines: All lines of the document.
            start_index: Index of the first line after the delimiter.

        Returns:
            The block's lines joined with newlines, or ``""`` when
            ``start_index`` is past the end of ``lines``.
        """
        if start_index >= len(lines):
            return ""

        candidate_lines = lines[start_index:]

        # Take the base indentation from the first non-blank line. A blank
        # line directly after the delimiter has indentation 0, which would
        # otherwise make the block swallow the rest of the document.
        reference_line = next(
            (line for line in candidate_lines if not self._is_empty_line(line)),
            candidate_lines[0],
        )
        base_indentation = self._get_indentation_level(reference_line)
        base_spaces = base_indentation * self._spaces_per_nesting_level
        block_lines = []

        for line in candidate_lines:
            if self._is_empty_line(line):
                block_lines.append(line)
                continue

            current_indentation = self._get_indentation_level(line)

            if current_indentation < base_indentation:
                break

            block_lines.append(line[base_spaces:] if len(line) >= base_spaces else line)

        return "\n".join(block_lines)

    def _is_empty_line(self, line: str) -> bool:
        """Return True for lines that are empty or whitespace-only."""
        return not line.strip()

    def _get_indentation_level(self, line: str) -> int:
        """Return the number of complete nesting levels of leading whitespace (floor division)."""
        leading_spaces = len(line) - len(line.lstrip())
        return leading_spaces // self._spaces_per_nesting_level

    def _validate_column_list_block(self, block_content: str) -> None:
        """Run the column-count and ratio-sum checks on one column-list body."""
        column_matches = self._find_all_columns(block_content)
        column_count = len(column_matches)

        self._validate_minimum_column_count(column_count)

        ratios = self._extract_column_ratios(column_matches)
        self._validate_ratio_sum(ratios, column_count)

    def _find_all_columns(self, content: str) -> list[re.Match]:
        """Return every match of the column pattern in ``content``."""
        return list(self._column_pattern.finditer(content))

    def _validate_minimum_column_count(self, column_count: int) -> None:
        """Log and raise ``InsufficientColumnsError`` when there are too few columns."""
        if column_count < self._MINIMUM_COLUMNS:
            self.logger.error(
                f"Column list must contain at least {self._MINIMUM_COLUMNS} columns, found {column_count}"
            )
            raise InsufficientColumnsError(column_count)

    def _extract_column_ratios(self, column_matches: list[re.Match]) -> list[float]:
        """Return the explicit ratios as floats; columns without one are skipped.

        Capture group 1 of the column pattern is read as the ratio text.
        """
        ratios = []

        for match in column_matches:
            ratio_text = match.group(1)
            if self._has_explicit_ratio(ratio_text):
                ratios.append(float(ratio_text))

        return ratios

    def _has_explicit_ratio(self, ratio_text: str | None) -> bool:
        """Return True unless the ratio is missing or is the literal text ``"1"``."""
        return ratio_text is not None and ratio_text != "1"

    def _validate_ratio_sum(self, ratios: list[float], column_count: int) -> None:
        """Log and raise ``InvalidColumnRatioSumError`` when the ratios do not sum to 1.0.

        The check is skipped unless every column has an explicit ratio.
        """
        if not self._should_validate_ratios(ratios, column_count):
            return

        total_ratio = sum(ratios)

        if not self._is_ratio_sum_valid(total_ratio):
            self.logger.error(f"Column ratios must sum to 1.0 (±{self._RATIO_TOLERANCE}), but sum to {total_ratio:.4f}")
            raise InvalidColumnRatioSumError(total_ratio, self._RATIO_TOLERANCE)

    def _should_validate_ratios(self, ratios: list[float], column_count: int) -> bool:
        """Return True only when there is at least one ratio and one ratio per column."""
        return len(ratios) > 0 and len(ratios) == column_count

    def _is_ratio_sum_valid(self, total: float) -> bool:
        """Return True when ``total`` is within ``_RATIO_TOLERANCE`` of 1.0."""
        return abs(total - 1.0) <= self._RATIO_TOLERANCE
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
import math
|
|
2
|
+
from typing import override
|
|
3
|
+
|
|
4
|
+
from notionary.page.content.parser.pre_processsing.handlers.port import PreProcessor
|
|
5
|
+
from notionary.page.content.syntax import MarkdownGrammar, SyntaxRegistry
|
|
6
|
+
from notionary.utils.decorators import time_execution_sync
|
|
7
|
+
from notionary.utils.mixins.logging import LoggingMixin
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class IndentationNormalizer(PreProcessor, LoggingMixin):
    """Rewrite leading whitespace to whole multiples of the grammar's nesting width.

    Lines inside fenced code blocks (and the fence lines themselves) are left
    untouched. Whitespace-only lines outside code blocks become empty lines.
    """

    def __init__(
        self, syntax_registry: SyntaxRegistry | None = None, markdown_grammar: MarkdownGrammar | None = None
    ) -> None:
        """Cache the nesting width and the code-fence delimiter.

        Args:
            syntax_registry: Source of the code-block syntax definition; a
                default ``SyntaxRegistry()`` is created when omitted.
            markdown_grammar: Source of ``spaces_per_nesting_level``; a default
                ``MarkdownGrammar()`` is created when omitted.
        """
        super().__init__()
        self._syntax_registry = syntax_registry or SyntaxRegistry()
        self._markdown_grammar = markdown_grammar or MarkdownGrammar()

        self._spaces_per_nesting_level = self._markdown_grammar.spaces_per_nesting_level
        self._code_block_start_delimiter = self._syntax_registry.get_code_syntax().start_delimiter

    @override
    @time_execution_sync()
    def process(self, markdown_text: str) -> str:
        """Return ``markdown_text`` with normalized indentation.

        A falsy input yields ``""``. A warning is logged whenever the
        normalized text differs from the input.
        """
        if self._is_empty(markdown_text):
            return ""

        normalized = self._normalize_to_markdown_indentation(markdown_text)

        if normalized != markdown_text:
            self.logger.warning(
                "Corrected non-standard indentation. Check the result for formatting errors and use consistent indentation in the source."
            )

        return normalized

    def _is_empty(self, text: str) -> bool:
        """Return True when ``text`` is falsy (empty string or ``None``)."""
        return not text

    def _normalize_to_markdown_indentation(self, markdown_text: str) -> str:
        """Normalize every line outside fenced code blocks."""
        lines = markdown_text.split("\n")
        processed_lines = []
        inside_code_block = False

        for line in lines:
            if self._is_code_fence(line):
                # Each fence line toggles the state; the fence itself is kept verbatim.
                inside_code_block = not inside_code_block
                processed_lines.append(line)
            elif inside_code_block:
                processed_lines.append(line)
            else:
                processed_lines.append(self._normalize_to_standard_indentation(line))

        return "\n".join(processed_lines)

    def _is_code_fence(self, line: str) -> bool:
        """Return True when the line, ignoring leading whitespace, starts with the code-block delimiter."""
        return line.lstrip().startswith(self._code_block_start_delimiter)

    def _normalize_to_standard_indentation(self, line: str) -> str:
        """Return the line re-indented to a whole nesting level; blank lines become ``""``."""
        if self._is_blank_line(line):
            return ""

        indentation_level = self._round_to_nearest_indentation_level(line)
        content = self._extract_content(line)

        return self._build_indented_line(indentation_level, content)

    def _is_blank_line(self, line: str) -> bool:
        """Return True for lines that are empty or whitespace-only."""
        return not line.strip()

    def _round_to_nearest_indentation_level(self, line: str) -> int:
        """Return the nesting level for the line's leading whitespace.

        Despite the name, partial levels are rounded *up* (``math.ceil``), so
        any non-zero indentation counts as at least one level.
        """
        leading_spaces = self._count_leading_spaces(line)
        return math.ceil(leading_spaces / self._spaces_per_nesting_level)

    def _count_leading_spaces(self, line: str) -> int:
        """Return the width, in columns, of the line's leading whitespace.

        Leading tabs are expanded to tab stops of one nesting level each, so a
        tab-indented line gets one level per tab instead of being counted as a
        single character. For space-only indentation the result is simply the
        number of leading spaces.
        """
        indent_length = len(line) - len(line.lstrip())
        leading_whitespace = line[:indent_length]
        return len(leading_whitespace.expandtabs(self._spaces_per_nesting_level))

    def _extract_content(self, line: str) -> str:
        """Return the line without its leading whitespace (trailing whitespace is kept)."""
        return line.lstrip()

    def _build_indented_line(self, level: int, content: str) -> str:
        """Prefix ``content`` with the standard indentation for ``level``."""
        standard_indent = self._create_standard_indent(level)
        return standard_indent + content

    def _create_standard_indent(self, level: int) -> str:
        """Return ``level`` nesting levels worth of spaces."""
        spaces = level * self._spaces_per_nesting_level
        return " " * spaces
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
import re
|
|
2
|
+
from typing import override
|
|
3
|
+
from urllib.parse import urlparse
|
|
4
|
+
|
|
5
|
+
from notionary.blocks.enums import VideoFileType
|
|
6
|
+
from notionary.exceptions import UnsupportedVideoFormatError
|
|
7
|
+
from notionary.page.content.parser.pre_processsing.handlers.port import PreProcessor
|
|
8
|
+
from notionary.page.content.syntax import SyntaxRegistry
|
|
9
|
+
from notionary.utils.decorators import time_execution_sync
|
|
10
|
+
from notionary.utils.mixins.logging import LoggingMixin
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class VideoFormatPreProcessor(PreProcessor, LoggingMixin):
    """Reject markdown video blocks that point at an unsupported URL.

    The text itself is never modified. Each line matching the video syntax
    must reference either a YouTube watch/embed URL or a URL accepted by
    ``VideoFileType.is_valid_extension``; otherwise
    ``UnsupportedVideoFormatError`` is raised.
    """

    # http(s)://[www.]youtube.com/watch?...v=<id>
    YOUTUBE_WATCH_PATTERN = re.compile(r"^https?://(?:www\.)?youtube\.com/watch\?.*v=[\w-]+", re.IGNORECASE)
    # http(s)://[www.]youtube.com/embed/<id>
    YOUTUBE_EMBED_PATTERN = re.compile(r"^https?://(?:www\.)?youtube\.com/embed/[\w-]+", re.IGNORECASE)

    def __init__(self, syntax_registry: SyntaxRegistry | None = None) -> None:
        """Look up the video syntax definition.

        Args:
            syntax_registry: Source of the video syntax; a default
                ``SyntaxRegistry()`` is created when omitted.
        """
        super().__init__()
        self._syntax_registry = syntax_registry or SyntaxRegistry()
        self._video_syntax = self._syntax_registry.get_video_syntax()

    @override
    @time_execution_sync()
    def process(self, markdown_text: str) -> str:
        """Validate every line and return the text re-joined from its lines.

        Raises:
            UnsupportedVideoFormatError: A video block references a URL that
                is neither a YouTube link nor has a supported extension.
        """
        lines = markdown_text.split("\n")
        validated_lines = [self._validate_or_reject_line(line) for line in lines]
        return "\n".join(validated_lines)

    def _validate_or_reject_line(self, line: str) -> str:
        """Return ``line`` unchanged, or raise if it holds an unsupported video URL."""
        if not self._contains_video_block(line):
            return line

        url = self._extract_url_from_video_block(line)

        if self._is_supported_video_url(url):
            return line

        supported_formats = list(VideoFileType.get_all_extensions())
        raise UnsupportedVideoFormatError(url, supported_formats)

    def _contains_video_block(self, line: str) -> bool:
        """Return True when the video syntax pattern matches anywhere in the line."""
        return self._video_syntax.regex_pattern.search(line) is not None

    def _extract_url_from_video_block(self, line: str) -> str:
        """Return capture group 1 of the video pattern, stripped; ``""`` when there is no match."""
        match = self._video_syntax.regex_pattern.search(line)
        return match.group(1).strip() if match else ""

    def _is_supported_video_url(self, url: str) -> bool:
        """Return True for YouTube URLs or URLs whose extension check passes."""
        return (
            self._is_youtube_video(url)
            or self._has_valid_video_extension(url)
            or self._url_path_has_valid_extension(url)
        )

    def _is_youtube_video(self, url: str) -> bool:
        """Return True when the URL matches the YouTube watch or embed pattern."""
        return bool(self.YOUTUBE_WATCH_PATTERN.match(url) or self.YOUTUBE_EMBED_PATTERN.match(url))

    def _has_valid_video_extension(self, url: str) -> bool:
        """Check the raw URL string with ``VideoFileType.is_valid_extension``."""
        return VideoFileType.is_valid_extension(url)

    def _url_path_has_valid_extension(self, url: str) -> bool:
        """Check only the lower-cased path component of the URL.

        This lets URLs with a query string or fragment after the file name
        pass. A URL that ``urlparse`` rejects as malformed is treated as
        unsupported.
        """
        try:
            parsed_url = urlparse(url)
        except ValueError:
            # urlparse signals malformed URLs (e.g. an invalid IPv6 host) with
            # ValueError; anything else would be a programming error and
            # should surface instead of being reported as "unsupported".
            return False

        return VideoFileType.is_valid_extension(parsed_url.path.lower())
|
|
@@ -1,10 +1,12 @@
|
|
|
1
1
|
from typing import override
|
|
2
2
|
|
|
3
3
|
from notionary.page.content.parser.pre_processsing.handlers.port import PreProcessor
|
|
4
|
+
from notionary.utils.decorators import time_execution_sync
|
|
4
5
|
|
|
5
6
|
|
|
6
7
|
class WhitespacePreProcessor(PreProcessor):
|
|
7
8
|
@override
|
|
9
|
+
@time_execution_sync()
|
|
8
10
|
def process(self, markdown_text: str) -> str:
|
|
9
11
|
if not markdown_text:
|
|
10
12
|
return ""
|
|
@@ -12,23 +14,34 @@ class WhitespacePreProcessor(PreProcessor):
|
|
|
12
14
|
lines = markdown_text.split("\n")
|
|
13
15
|
processed_lines = []
|
|
14
16
|
code_block_lines = []
|
|
17
|
+
non_code_lines = []
|
|
15
18
|
in_code_block = False
|
|
16
19
|
|
|
17
20
|
for line in lines:
|
|
18
21
|
if self._is_code_fence(line):
|
|
19
22
|
if in_code_block:
|
|
23
|
+
# Format and add code block
|
|
20
24
|
processed_lines.extend(self._format_code_block(code_block_lines))
|
|
21
25
|
processed_lines.append("```")
|
|
22
26
|
code_block_lines = []
|
|
23
27
|
in_code_block = False
|
|
24
28
|
else:
|
|
29
|
+
# Format accumulated non-code lines before starting code block
|
|
30
|
+
if non_code_lines:
|
|
31
|
+
processed_lines.extend(self._format_code_block(non_code_lines))
|
|
32
|
+
non_code_lines = []
|
|
33
|
+
|
|
25
34
|
language = self._extract_language(line)
|
|
26
35
|
processed_lines.append(f"```{language}")
|
|
27
36
|
in_code_block = True
|
|
28
37
|
elif in_code_block:
|
|
29
38
|
code_block_lines.append(line)
|
|
30
39
|
else:
|
|
31
|
-
|
|
40
|
+
non_code_lines.append(line)
|
|
41
|
+
|
|
42
|
+
# Format remaining non-code lines at the end
|
|
43
|
+
if non_code_lines:
|
|
44
|
+
processed_lines.extend(self._format_code_block(non_code_lines))
|
|
32
45
|
|
|
33
46
|
return "\n".join(processed_lines)
|
|
34
47
|
|
|
@@ -39,12 +52,6 @@ class WhitespacePreProcessor(PreProcessor):
|
|
|
39
52
|
return fence_line.lstrip().removeprefix("```").strip()
|
|
40
53
|
|
|
41
54
|
def _format_code_block(self, lines: list[str]) -> list[str]:
|
|
42
|
-
"""
|
|
43
|
-
Format code block by removing common leading whitespace.
|
|
44
|
-
|
|
45
|
-
Preserves relative indentation between lines.
|
|
46
|
-
Empty lines are preserved as-is.
|
|
47
|
-
"""
|
|
48
55
|
if not lines:
|
|
49
56
|
return []
|
|
50
57
|
|
|
@@ -33,6 +33,8 @@ class MarkdownToNotionConverter(LoggingMixin):
|
|
|
33
33
|
parent_stack: list[ParentBlockContext] = []
|
|
34
34
|
|
|
35
35
|
current_line_index = 0
|
|
36
|
+
previous_line_was_empty = False
|
|
37
|
+
|
|
36
38
|
while current_line_index < len(lines):
|
|
37
39
|
line = lines[current_line_index]
|
|
38
40
|
|
|
@@ -42,9 +44,11 @@ class MarkdownToNotionConverter(LoggingMixin):
|
|
|
42
44
|
line_index=current_line_index,
|
|
43
45
|
result_blocks=result_blocks,
|
|
44
46
|
parent_stack=parent_stack,
|
|
47
|
+
is_previous_line_empty=previous_line_was_empty,
|
|
45
48
|
)
|
|
46
49
|
|
|
47
50
|
await self._line_parser.handle(context)
|
|
51
|
+
previous_line_was_empty = self._is_processed_line_empty(line)
|
|
48
52
|
|
|
49
53
|
current_line_index += 1 + context.lines_consumed
|
|
50
54
|
|
|
@@ -57,6 +61,7 @@ class MarkdownToNotionConverter(LoggingMixin):
|
|
|
57
61
|
line_index: int,
|
|
58
62
|
result_blocks: list[BlockCreatePayload],
|
|
59
63
|
parent_stack: list[ParentBlockContext],
|
|
64
|
+
is_previous_line_empty: bool = False,
|
|
60
65
|
) -> BlockParsingContext:
|
|
61
66
|
return BlockParsingContext(
|
|
62
67
|
line=line,
|
|
@@ -66,4 +71,8 @@ class MarkdownToNotionConverter(LoggingMixin):
|
|
|
66
71
|
all_lines=lines,
|
|
67
72
|
current_line_index=line_index,
|
|
68
73
|
lines_consumed=0,
|
|
74
|
+
is_previous_line_empty=is_previous_line_empty,
|
|
69
75
|
)
|
|
76
|
+
|
|
77
|
+
def _is_processed_line_empty(self, line: str) -> bool:
|
|
78
|
+
return line.strip() == ""
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
from collections.abc import Awaitable, Callable
|
|
2
2
|
|
|
3
3
|
from notionary.blocks.schemas import Block
|
|
4
|
+
from notionary.page.content.syntax.grammar import MarkdownGrammar
|
|
4
5
|
|
|
5
6
|
ConvertChildrenCallback = Callable[[list[Block], int], Awaitable[str]]
|
|
6
7
|
|
|
@@ -11,13 +12,15 @@ class MarkdownRenderingContext:
|
|
|
11
12
|
block: Block,
|
|
12
13
|
indent_level: int,
|
|
13
14
|
convert_children_callback: ConvertChildrenCallback | None = None,
|
|
15
|
+
markdown_grammar: MarkdownGrammar | None = None,
|
|
14
16
|
) -> None:
|
|
15
17
|
self.block = block
|
|
16
18
|
self.indent_level = indent_level
|
|
17
19
|
self.convert_children_callback = convert_children_callback
|
|
20
|
+
markdown_grammar = markdown_grammar or MarkdownGrammar()
|
|
21
|
+
self._spaces_per_nesting_level = markdown_grammar.spaces_per_nesting_level
|
|
18
22
|
|
|
19
23
|
self.markdown_result: str | None = None
|
|
20
|
-
self._markdown_indentation_multiplier = 4
|
|
21
24
|
|
|
22
25
|
async def render_children(self) -> str:
|
|
23
26
|
return await self._convert_children_to_markdown(self.indent_level)
|
|
@@ -43,6 +46,6 @@ class MarkdownRenderingContext:
|
|
|
43
46
|
if not text:
|
|
44
47
|
return text
|
|
45
48
|
|
|
46
|
-
spaces = " " * self.
|
|
49
|
+
spaces = " " * self._spaces_per_nesting_level * self.indent_level
|
|
47
50
|
lines = text.split("\n")
|
|
48
51
|
return "\n".join(f"{spaces}{line}" if line.strip() else line for line in lines)
|
|
@@ -26,11 +26,10 @@ from notionary.page.content.renderer.renderers import (
|
|
|
26
26
|
TableRenderer,
|
|
27
27
|
TableRowHandler,
|
|
28
28
|
TodoRenderer,
|
|
29
|
-
ToggleableHeadingRenderer,
|
|
30
29
|
ToggleRenderer,
|
|
31
30
|
VideoRenderer,
|
|
32
31
|
)
|
|
33
|
-
from notionary.page.content.syntax
|
|
32
|
+
from notionary.page.content.syntax import SyntaxRegistry
|
|
34
33
|
|
|
35
34
|
|
|
36
35
|
class RendererChainFactory:
|
|
@@ -45,7 +44,6 @@ class RendererChainFactory:
|
|
|
45
44
|
def create(self) -> BlockRenderer:
|
|
46
45
|
# Structural blocks
|
|
47
46
|
toggle_handler = self._create_toggle_renderer()
|
|
48
|
-
toggleable_heading_handler = self._create_toggleable_heading_renderer()
|
|
49
47
|
heading_handler = self._create_heading_renderer()
|
|
50
48
|
|
|
51
49
|
# Content Blocks
|
|
@@ -83,8 +81,7 @@ class RendererChainFactory:
|
|
|
83
81
|
|
|
84
82
|
# Link the chain - most specific first, fallback last
|
|
85
83
|
(
|
|
86
|
-
toggle_handler.set_next(
|
|
87
|
-
.set_next(heading_handler)
|
|
84
|
+
toggle_handler.set_next(heading_handler)
|
|
88
85
|
.set_next(callout_handler)
|
|
89
86
|
.set_next(code_handler)
|
|
90
87
|
.set_next(quote_handler)
|
|
@@ -119,12 +116,6 @@ class RendererChainFactory:
|
|
|
119
116
|
rich_text_markdown_converter=self._rich_text_markdown_converter,
|
|
120
117
|
)
|
|
121
118
|
|
|
122
|
-
def _create_toggleable_heading_renderer(self) -> ToggleableHeadingRenderer:
|
|
123
|
-
return ToggleableHeadingRenderer(
|
|
124
|
-
syntax_registry=self._syntax_registry,
|
|
125
|
-
rich_text_markdown_converter=self._rich_text_markdown_converter,
|
|
126
|
-
)
|
|
127
|
-
|
|
128
119
|
def _create_heading_renderer(self) -> HeadingRenderer:
|
|
129
120
|
return HeadingRenderer(
|
|
130
121
|
syntax_registry=self._syntax_registry,
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
from .
|
|
1
|
+
from .numbered_list import NumberedListPlaceholderReplacerPostProcessor
|
|
2
2
|
|
|
3
3
|
__all__ = [
|
|
4
|
-
"
|
|
4
|
+
"NumberedListPlaceholderReplacerPostProcessor",
|
|
5
5
|
]
|