markdown-flow 0.2.18__py3-none-any.whl → 0.2.26__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of markdown-flow might be problematic. Click here for more details.
- markdown_flow/__init__.py +3 -4
- markdown_flow/constants.py +47 -40
- markdown_flow/core.py +340 -94
- markdown_flow/llm.py +4 -3
- markdown_flow/models.py +1 -1
- markdown_flow/parser/__init__.py +34 -0
- markdown_flow/parser/interaction.py +354 -0
- markdown_flow/parser/json_parser.py +50 -0
- markdown_flow/parser/output.py +215 -0
- markdown_flow/parser/validation.py +121 -0
- markdown_flow/parser/variable.py +95 -0
- markdown_flow/providers/__init__.py +15 -0
- markdown_flow/providers/config.py +51 -0
- markdown_flow/providers/openai.py +371 -0
- markdown_flow/utils.py +43 -43
- {markdown_flow-0.2.18.dist-info → markdown_flow-0.2.26.dist-info}/METADATA +45 -52
- markdown_flow-0.2.26.dist-info/RECORD +22 -0
- markdown_flow-0.2.18.dist-info/RECORD +0 -13
- {markdown_flow-0.2.18.dist-info → markdown_flow-0.2.26.dist-info}/WHEEL +0 -0
- {markdown_flow-0.2.18.dist-info → markdown_flow-0.2.26.dist-info}/licenses/LICENSE +0 -0
- {markdown_flow-0.2.18.dist-info → markdown_flow-0.2.26.dist-info}/top_level.txt +0 -0
markdown_flow/llm.py
CHANGED
|
@@ -15,7 +15,6 @@ from .constants import NO_LLM_PROVIDER_ERROR
|
|
|
15
15
|
class ProcessMode(Enum):
|
|
16
16
|
"""LLM processing modes."""
|
|
17
17
|
|
|
18
|
-
PROMPT_ONLY = "prompt_only" # Return prompt only, no LLM call
|
|
19
18
|
COMPLETE = "complete" # Complete processing (non-streaming)
|
|
20
19
|
STREAM = "stream" # Streaming processing
|
|
21
20
|
|
|
@@ -43,7 +42,8 @@ class LLMProvider(ABC):
|
|
|
43
42
|
Non-streaming LLM call.
|
|
44
43
|
|
|
45
44
|
Args:
|
|
46
|
-
messages: Message list in format [{"role": "system/user/assistant", "content": "..."}]
|
|
45
|
+
messages: Message list in format [{"role": "system/user/assistant", "content": "..."}].
|
|
46
|
+
This list already includes conversation history context merged by MarkdownFlow.
|
|
47
47
|
|
|
48
48
|
Returns:
|
|
49
49
|
str: LLM response content
|
|
@@ -58,7 +58,8 @@ class LLMProvider(ABC):
|
|
|
58
58
|
Streaming LLM call.
|
|
59
59
|
|
|
60
60
|
Args:
|
|
61
|
-
messages: Message list in format [{"role": "system/user/assistant", "content": "..."}]
|
|
61
|
+
messages: Message list in format [{"role": "system/user/assistant", "content": "..."}].
|
|
62
|
+
This list already includes conversation history context merged by MarkdownFlow.
|
|
62
63
|
|
|
63
64
|
Yields:
|
|
64
65
|
str: Incremental LLM response content
|
markdown_flow/models.py
CHANGED
|
@@ -7,7 +7,7 @@ Simplified and refactored data models focused on core functionality.
|
|
|
7
7
|
from dataclasses import dataclass, field
|
|
8
8
|
|
|
9
9
|
from .enums import BlockType, InputType
|
|
10
|
-
from .
|
|
10
|
+
from .parser import extract_variables_from_text
|
|
11
11
|
|
|
12
12
|
|
|
13
13
|
@dataclass
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Markdown-Flow Parser Module
|
|
3
|
+
|
|
4
|
+
Provides specialized parsers for different aspects of MarkdownFlow document processing.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from .interaction import InteractionParser, InteractionType, extract_interaction_question
|
|
8
|
+
from .json_parser import parse_json_response
|
|
9
|
+
from .output import (
|
|
10
|
+
extract_preserved_content,
|
|
11
|
+
is_preserved_content_block,
|
|
12
|
+
process_output_instructions,
|
|
13
|
+
)
|
|
14
|
+
from .validation import generate_smart_validation_template, parse_validation_response
|
|
15
|
+
from .variable import extract_variables_from_text, replace_variables_in_text
|
|
16
|
+
|
|
17
|
+
__all__ = [
|
|
18
|
+
# Variable parsing
|
|
19
|
+
"extract_variables_from_text",
|
|
20
|
+
"replace_variables_in_text",
|
|
21
|
+
# Interaction parsing
|
|
22
|
+
"InteractionParser",
|
|
23
|
+
"InteractionType",
|
|
24
|
+
"extract_interaction_question",
|
|
25
|
+
# Output and preserved content
|
|
26
|
+
"is_preserved_content_block",
|
|
27
|
+
"extract_preserved_content",
|
|
28
|
+
"process_output_instructions",
|
|
29
|
+
# Validation
|
|
30
|
+
"generate_smart_validation_template",
|
|
31
|
+
"parse_validation_response",
|
|
32
|
+
# JSON parsing
|
|
33
|
+
"parse_json_response",
|
|
34
|
+
]
|
|
@@ -0,0 +1,354 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Interaction Parser Module
|
|
3
|
+
|
|
4
|
+
Provides three-layer interaction parsing for MarkdownFlow ?[] format validation,
|
|
5
|
+
variable detection, and content parsing.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from enum import Enum
|
|
9
|
+
from typing import Any
|
|
10
|
+
|
|
11
|
+
from ..constants import (
|
|
12
|
+
COMPILED_INTERACTION_REGEX,
|
|
13
|
+
COMPILED_LAYER1_INTERACTION_REGEX,
|
|
14
|
+
COMPILED_LAYER2_VARIABLE_REGEX,
|
|
15
|
+
COMPILED_LAYER3_ELLIPSIS_REGEX,
|
|
16
|
+
COMPILED_SINGLE_PIPE_SPLIT_REGEX,
|
|
17
|
+
)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class InteractionType(Enum):
    """Interaction input type enumeration.

    Each member identifies one shape of ``?[...]`` interaction block. The
    trailing comment on every member shows an example of the syntax it
    stands for. InteractionParser places one of these members under the
    ``"type"`` key of its parse result.
    """

    TEXT_ONLY = "text_only"  # Text input only: ?[%{{var}}...question]
    BUTTONS_ONLY = "buttons_only"  # Button selection only: ?[%{{var}} A|B]
    BUTTONS_WITH_TEXT = "buttons_with_text"  # Buttons + text: ?[%{{var}} A|B|...question]
    BUTTONS_MULTI_SELECT = "buttons_multi_select"  # Multi-select buttons: ?[%{{var}} A||B]
    BUTTONS_MULTI_WITH_TEXT = "buttons_multi_with_text"  # Multi-select + text: ?[%{{var}} A||B||...question]
    NON_ASSIGNMENT_BUTTON = "non_assignment_button"  # Display buttons: ?[Continue|Cancel]
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def extract_interaction_question(content: str) -> str | None:
    """
    Return the question text that follows the ``...`` separator of an interaction block.

    Args:
        content: Raw interaction block content; surrounding whitespace is ignored

    Returns:
        The stripped text after the first ``...`` inside the block, or None when
        the content does not match the interaction pattern or has no ``...``
    """
    found = COMPILED_INTERACTION_REGEX.match(content.strip())
    if not found:
        return None

    # Prefer the captured group; without one, drop the leading "?[" and trailing "]"
    body = found.group(1) if found.groups() else found.group(0)[2:-1]

    # Everything after the first "..." is the question
    _, separator, question = body.partition("...")
    return question.strip() if separator else None
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
class InteractionParser:
|
|
60
|
+
"""
|
|
61
|
+
Three-layer interaction parser for ?[] format validation,
|
|
62
|
+
variable detection, and content parsing.
|
|
63
|
+
"""
|
|
64
|
+
|
|
65
|
+
def __init__(self):
|
|
66
|
+
"""Initialize parser."""
|
|
67
|
+
|
|
68
|
+
def parse(self, content: str) -> dict[str, Any]:
|
|
69
|
+
"""
|
|
70
|
+
Main parsing method.
|
|
71
|
+
|
|
72
|
+
Args:
|
|
73
|
+
content: Raw interaction block content
|
|
74
|
+
|
|
75
|
+
Returns:
|
|
76
|
+
Standardized parsing result with type, variable, buttons, and question fields
|
|
77
|
+
"""
|
|
78
|
+
try:
|
|
79
|
+
# Layer 1: Validate basic format
|
|
80
|
+
inner_content = self._layer1_validate_format(content)
|
|
81
|
+
if inner_content is None:
|
|
82
|
+
return self._create_error_result(f"Invalid interaction format: {content}")
|
|
83
|
+
|
|
84
|
+
# Layer 2: Variable detection and pattern classification
|
|
85
|
+
has_variable, variable_name, remaining_content = self._layer2_detect_variable(inner_content)
|
|
86
|
+
|
|
87
|
+
# Layer 3: Specific content parsing
|
|
88
|
+
if has_variable:
|
|
89
|
+
assert variable_name is not None, "variable_name should not be None when has_variable is True"
|
|
90
|
+
return self._layer3_parse_variable_interaction(variable_name, remaining_content)
|
|
91
|
+
return self._layer3_parse_display_buttons(inner_content)
|
|
92
|
+
|
|
93
|
+
except Exception as e:
|
|
94
|
+
return self._create_error_result(f"Parsing error: {str(e)}")
|
|
95
|
+
|
|
96
|
+
def _layer1_validate_format(self, content: str) -> str | None:
|
|
97
|
+
"""
|
|
98
|
+
Layer 1: Validate ?[] format and extract content.
|
|
99
|
+
|
|
100
|
+
Args:
|
|
101
|
+
content: Raw content
|
|
102
|
+
|
|
103
|
+
Returns:
|
|
104
|
+
Extracted bracket content, None if validation fails
|
|
105
|
+
"""
|
|
106
|
+
content = content.strip()
|
|
107
|
+
match = COMPILED_LAYER1_INTERACTION_REGEX.search(content)
|
|
108
|
+
|
|
109
|
+
if not match:
|
|
110
|
+
return None # type: ignore[unreachable]
|
|
111
|
+
|
|
112
|
+
# Ensure matched content is complete (no other text)
|
|
113
|
+
matched_text = match.group(0)
|
|
114
|
+
if matched_text.strip() != content:
|
|
115
|
+
return None
|
|
116
|
+
|
|
117
|
+
return match.group(1)
|
|
118
|
+
|
|
119
|
+
def _layer2_detect_variable(self, inner_content: str) -> tuple[bool, str | None, str]:
|
|
120
|
+
"""
|
|
121
|
+
Layer 2: Detect variables and classify patterns.
|
|
122
|
+
|
|
123
|
+
Args:
|
|
124
|
+
inner_content: Content extracted from layer 1
|
|
125
|
+
|
|
126
|
+
Returns:
|
|
127
|
+
Tuple of (has_variable, variable_name, remaining_content)
|
|
128
|
+
"""
|
|
129
|
+
match = COMPILED_LAYER2_VARIABLE_REGEX.match(inner_content)
|
|
130
|
+
|
|
131
|
+
if not match:
|
|
132
|
+
# No variable, use entire content for display button parsing
|
|
133
|
+
return False, None, inner_content # type: ignore[unreachable]
|
|
134
|
+
|
|
135
|
+
variable_name = match.group(1).strip()
|
|
136
|
+
remaining_content = match.group(2).strip()
|
|
137
|
+
|
|
138
|
+
return True, variable_name, remaining_content
|
|
139
|
+
|
|
140
|
+
def _layer3_parse_variable_interaction(self, variable_name: str, content: str) -> dict[str, Any]:
|
|
141
|
+
"""
|
|
142
|
+
Layer 3: Parse variable interactions (variable assignment type).
|
|
143
|
+
|
|
144
|
+
Args:
|
|
145
|
+
variable_name: Variable name
|
|
146
|
+
content: Content after variable
|
|
147
|
+
|
|
148
|
+
Returns:
|
|
149
|
+
Parsing result dictionary
|
|
150
|
+
"""
|
|
151
|
+
# Detect ... separator
|
|
152
|
+
ellipsis_match = COMPILED_LAYER3_ELLIPSIS_REGEX.match(content)
|
|
153
|
+
|
|
154
|
+
if ellipsis_match:
|
|
155
|
+
# Has ... separator
|
|
156
|
+
before_ellipsis = ellipsis_match.group(1).strip()
|
|
157
|
+
question = ellipsis_match.group(2).strip()
|
|
158
|
+
|
|
159
|
+
if before_ellipsis:
|
|
160
|
+
# Has prefix content (buttons or single option) + text input
|
|
161
|
+
buttons, is_multi_select = self._parse_buttons(before_ellipsis)
|
|
162
|
+
interaction_type = InteractionType.BUTTONS_MULTI_WITH_TEXT if is_multi_select else InteractionType.BUTTONS_WITH_TEXT
|
|
163
|
+
return {
|
|
164
|
+
"type": interaction_type,
|
|
165
|
+
"variable": variable_name,
|
|
166
|
+
"buttons": buttons,
|
|
167
|
+
"question": question,
|
|
168
|
+
"is_multi_select": is_multi_select,
|
|
169
|
+
}
|
|
170
|
+
# Pure text input
|
|
171
|
+
return {
|
|
172
|
+
"type": InteractionType.TEXT_ONLY,
|
|
173
|
+
"variable": variable_name,
|
|
174
|
+
"question": question,
|
|
175
|
+
"is_multi_select": False,
|
|
176
|
+
}
|
|
177
|
+
# No ... separator
|
|
178
|
+
if ("|" in content or "||" in content) and content: # type: ignore[unreachable]
|
|
179
|
+
# Pure button group
|
|
180
|
+
buttons, is_multi_select = self._parse_buttons(content)
|
|
181
|
+
interaction_type = InteractionType.BUTTONS_MULTI_SELECT if is_multi_select else InteractionType.BUTTONS_ONLY
|
|
182
|
+
return {
|
|
183
|
+
"type": interaction_type,
|
|
184
|
+
"variable": variable_name,
|
|
185
|
+
"buttons": buttons,
|
|
186
|
+
"is_multi_select": is_multi_select,
|
|
187
|
+
}
|
|
188
|
+
if content: # type: ignore[unreachable]
|
|
189
|
+
# Single button
|
|
190
|
+
button = self._parse_single_button(content)
|
|
191
|
+
return {
|
|
192
|
+
"type": InteractionType.BUTTONS_ONLY,
|
|
193
|
+
"variable": variable_name,
|
|
194
|
+
"buttons": [button],
|
|
195
|
+
"is_multi_select": False,
|
|
196
|
+
}
|
|
197
|
+
# Pure text input (no hint)
|
|
198
|
+
return {
|
|
199
|
+
"type": InteractionType.TEXT_ONLY,
|
|
200
|
+
"variable": variable_name,
|
|
201
|
+
"question": "",
|
|
202
|
+
"is_multi_select": False,
|
|
203
|
+
}
|
|
204
|
+
|
|
205
|
+
def _layer3_parse_display_buttons(self, content: str) -> dict[str, Any]:
|
|
206
|
+
"""
|
|
207
|
+
Layer 3: Parse display buttons (non-variable assignment type).
|
|
208
|
+
|
|
209
|
+
Args:
|
|
210
|
+
content: Content to parse
|
|
211
|
+
|
|
212
|
+
Returns:
|
|
213
|
+
Parsing result dictionary
|
|
214
|
+
"""
|
|
215
|
+
if not content:
|
|
216
|
+
# Empty content: ?[]
|
|
217
|
+
return {
|
|
218
|
+
"type": InteractionType.NON_ASSIGNMENT_BUTTON,
|
|
219
|
+
"buttons": [{"display": "", "value": ""}],
|
|
220
|
+
}
|
|
221
|
+
|
|
222
|
+
if "|" in content:
|
|
223
|
+
# Multiple buttons
|
|
224
|
+
buttons, _ = self._parse_buttons(content) # Display buttons don't use multi-select
|
|
225
|
+
return {"type": InteractionType.NON_ASSIGNMENT_BUTTON, "buttons": buttons}
|
|
226
|
+
# Single button
|
|
227
|
+
button = self._parse_single_button(content)
|
|
228
|
+
return {"type": InteractionType.NON_ASSIGNMENT_BUTTON, "buttons": [button]}
|
|
229
|
+
|
|
230
|
+
def _parse_buttons(self, content: str) -> tuple[list[dict[str, str]], bool]:
|
|
231
|
+
"""
|
|
232
|
+
Parse button group with fault tolerance.
|
|
233
|
+
|
|
234
|
+
Args:
|
|
235
|
+
content: Button content separated by | or ||
|
|
236
|
+
|
|
237
|
+
Returns:
|
|
238
|
+
Tuple of (button list, is_multi_select)
|
|
239
|
+
"""
|
|
240
|
+
if not content or not isinstance(content, str):
|
|
241
|
+
return [], False
|
|
242
|
+
|
|
243
|
+
_, is_multi_select = self._detect_separator_type(content)
|
|
244
|
+
|
|
245
|
+
buttons = []
|
|
246
|
+
try:
|
|
247
|
+
# Use different splitting logic based on separator type
|
|
248
|
+
if is_multi_select:
|
|
249
|
+
# Multi-select mode: split on ||, preserve single |
|
|
250
|
+
button_parts = content.split("||")
|
|
251
|
+
else:
|
|
252
|
+
# Single-select mode: split on single |, but preserve ||
|
|
253
|
+
# Use pre-compiled regex from constants
|
|
254
|
+
button_parts = COMPILED_SINGLE_PIPE_SPLIT_REGEX.split(content)
|
|
255
|
+
|
|
256
|
+
for button_text in button_parts:
|
|
257
|
+
button_text = button_text.strip()
|
|
258
|
+
if button_text:
|
|
259
|
+
button = self._parse_single_button(button_text)
|
|
260
|
+
buttons.append(button)
|
|
261
|
+
except (TypeError, ValueError):
|
|
262
|
+
# Fallback to treating entire content as single button
|
|
263
|
+
return [{"display": content.strip(), "value": content.strip()}], False
|
|
264
|
+
|
|
265
|
+
# For empty content (like just separators), return empty list
|
|
266
|
+
if not buttons and (content.strip() == "||" or content.strip() == "|"):
|
|
267
|
+
return [], is_multi_select
|
|
268
|
+
|
|
269
|
+
# Ensure at least one button exists (but only if there's actual content)
|
|
270
|
+
if not buttons and content.strip():
|
|
271
|
+
buttons = [{"display": content.strip(), "value": content.strip()}]
|
|
272
|
+
|
|
273
|
+
return buttons, is_multi_select
|
|
274
|
+
|
|
275
|
+
def _parse_single_button(self, button_text: str) -> dict[str, str]:
|
|
276
|
+
"""
|
|
277
|
+
Parse single button with fault tolerance, supports Button//value format.
|
|
278
|
+
|
|
279
|
+
Args:
|
|
280
|
+
button_text: Button text
|
|
281
|
+
|
|
282
|
+
Returns:
|
|
283
|
+
Dictionary with display and value keys
|
|
284
|
+
"""
|
|
285
|
+
if not button_text or not isinstance(button_text, str):
|
|
286
|
+
return {"display": "", "value": ""}
|
|
287
|
+
|
|
288
|
+
button_text = button_text.strip()
|
|
289
|
+
if not button_text:
|
|
290
|
+
return {"display": "", "value": ""}
|
|
291
|
+
|
|
292
|
+
try:
|
|
293
|
+
# Detect Button//value format - split only on first //
|
|
294
|
+
if "//" in button_text:
|
|
295
|
+
parts = button_text.split("//", 1) # Split only on first //
|
|
296
|
+
display = parts[0].strip()
|
|
297
|
+
value = parts[1] if len(parts) > 1 else ""
|
|
298
|
+
# Don't strip value to preserve intentional spacing/formatting
|
|
299
|
+
return {"display": display, "value": value}
|
|
300
|
+
except (ValueError, IndexError):
|
|
301
|
+
# Fallback: use text as both display and value
|
|
302
|
+
pass
|
|
303
|
+
|
|
304
|
+
return {"display": button_text, "value": button_text}
|
|
305
|
+
|
|
306
|
+
def _detect_separator_type(self, content: str) -> tuple[str, bool]:
|
|
307
|
+
"""
|
|
308
|
+
Detect separator type and whether it's multi-select.
|
|
309
|
+
|
|
310
|
+
Implements fault tolerance: first separator type encountered determines the behavior.
|
|
311
|
+
Mixed separators are handled by treating the rest as literal text.
|
|
312
|
+
|
|
313
|
+
Args:
|
|
314
|
+
content: Button content to analyze
|
|
315
|
+
|
|
316
|
+
Returns:
|
|
317
|
+
Tuple of (separator, is_multi_select) where separator is '|' or '||'
|
|
318
|
+
"""
|
|
319
|
+
if not content or not isinstance(content, str):
|
|
320
|
+
return "|", False
|
|
321
|
+
|
|
322
|
+
# Find first occurrence of separators
|
|
323
|
+
single_pos = content.find("|")
|
|
324
|
+
double_pos = content.find("||")
|
|
325
|
+
|
|
326
|
+
# If no separators found
|
|
327
|
+
if single_pos == -1 and double_pos == -1:
|
|
328
|
+
return "|", False
|
|
329
|
+
|
|
330
|
+
# If only single separator found
|
|
331
|
+
if double_pos == -1:
|
|
332
|
+
return "|", False
|
|
333
|
+
|
|
334
|
+
# If only double separator found
|
|
335
|
+
if single_pos == -1:
|
|
336
|
+
return "||", True
|
|
337
|
+
|
|
338
|
+
# Both found - fault tolerance: first occurrence wins
|
|
339
|
+
# This handles mixed cases like "A||B|C" (multi-select) and "A|B||C" (single-select)
|
|
340
|
+
if double_pos <= single_pos:
|
|
341
|
+
return "||", True
|
|
342
|
+
return "|", False
|
|
343
|
+
|
|
344
|
+
def _create_error_result(self, error_message: str) -> dict[str, Any]:
|
|
345
|
+
"""
|
|
346
|
+
Create error result.
|
|
347
|
+
|
|
348
|
+
Args:
|
|
349
|
+
error_message: Error message
|
|
350
|
+
|
|
351
|
+
Returns:
|
|
352
|
+
Error result dictionary
|
|
353
|
+
"""
|
|
354
|
+
return {"type": None, "error": error_message} # type: ignore[unreachable]
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
"""
|
|
2
|
+
JSON Parser Module
|
|
3
|
+
|
|
4
|
+
Provides robust JSON parsing with support for code blocks and mixed text formats.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import json
|
|
8
|
+
import re
|
|
9
|
+
from typing import Any
|
|
10
|
+
|
|
11
|
+
from ..constants import JSON_PARSE_ERROR
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def parse_json_response(response_text: str) -> dict[str, Any]:
    """
    Parse JSON response supporting multiple formats.

    Supports pure JSON strings, ```json code blocks, and mixed text formats.
    For mixed text, the first ``{`` from which a complete JSON object can be
    decoded wins. Nested objects, empty objects and braces inside string
    values are handled.

    Args:
        response_text: Response text to parse

    Returns:
        Parsed dictionary object. If the text as a whole is valid JSON of
        another type (for example a list), that value is returned unchanged.

    Raises:
        ValueError: When JSON cannot be parsed
    """
    text = response_text.strip()

    # Extract the body of a fenced block; a ```json fence takes priority over
    # a bare ``` fence. An unterminated fence leaves the text untouched.
    for fence in ("```json", "```"):
        fence_pos = text.find(fence)
        if fence_pos == -1:
            continue
        start_idx = fence_pos + len(fence)
        end_idx = text.find("```", start_idx)
        if end_idx != -1:
            text = text[start_idx:end_idx].strip()
        break

    try:
        return json.loads(text)
    except json.JSONDecodeError:
        pass

    # Mixed text: try to decode an object starting at each "{" in turn.
    # raw_decode stops at the end of the first complete JSON value, so
    # trailing prose after the object is ignored.
    decoder = json.JSONDecoder()
    brace_pos = text.find("{")
    while brace_pos != -1:
        try:
            candidate, _ = decoder.raw_decode(text, brace_pos)
        except json.JSONDecodeError:
            brace_pos = text.find("{", brace_pos + 1)
        else:
            return candidate

    raise ValueError(JSON_PARSE_ERROR)
|
|
@@ -0,0 +1,215 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Output Parser Module
|
|
3
|
+
|
|
4
|
+
Handles output instructions and preserved content processing for MarkdownFlow documents.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import re
|
|
8
|
+
|
|
9
|
+
from ..constants import (
|
|
10
|
+
COMPILED_INLINE_PRESERVE_REGEX,
|
|
11
|
+
COMPILED_PRESERVE_FENCE_REGEX,
|
|
12
|
+
OUTPUT_INSTRUCTION_PREFIX,
|
|
13
|
+
OUTPUT_INSTRUCTION_SUFFIX,
|
|
14
|
+
)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def is_preserved_content_block(content: str) -> bool:
    """
    Tell whether the content consists of nothing but preserved content.

    Every non-empty line must be a fence marker, lie between an opening and a
    closing fence marker, or be a valid inline preserved line (===content===).
    Any other non-empty line makes the whole content non-preserved.

    Args:
        content: Content to check

    Returns:
        True if at least one preserved element was seen, no external content
        was found, and every opened fence block was closed again
    """
    text = content.strip()
    if not text:
        return False

    inside_fence = False
    saw_preserved = False

    for raw_line in text.split("\n"):
        candidate = raw_line.strip()

        # Fence marker line: toggles the block state
        if COMPILED_PRESERVE_FENCE_REGEX.match(candidate):
            if not inside_fence:
                # Opening a fence already counts as preserved content
                saw_preserved = True
            inside_fence = not inside_fence
            continue

        # Blank lines never disqualify the content
        if not candidate:
            continue

        # Anything between fences is preserved as-is
        if inside_fence:
            saw_preserved = True
            continue

        # Outside a fence, only a well-formed inline marker line is acceptable
        inline = COMPILED_INLINE_PRESERVE_REGEX.match(candidate)
        if not inline:
            return False
        payload = inline.group(1).strip()
        if not payload or "===" in payload:
            return False
        saw_preserved = True

    # An unclosed fence means the content is not fully preserved
    return saw_preserved and not inside_fence
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def process_output_instructions(content: str) -> tuple[str, bool]:
    """
    Rewrite preserved-content markers as output instructions.

    Inline markers (===content===) are replaced in place on their line, and
    fenced blocks (a fence line, content lines, a closing fence line) collapse
    into one entry. In both cases the content is wrapped in
    OUTPUT_INSTRUCTION_PREFIX and OUTPUT_INSTRUCTION_SUFFIX. A fence that is
    never closed is left exactly as written.

    Args:
        content: Raw content containing output instructions

    Returns:
        Tuple of (processed_content, has_preserved_content):
            - processed_content: Content with the markers converted
            - has_preserved_content: True if at least one marker was converted
    """
    source_lines = content.split("\n")
    total = len(source_lines)
    emitted: list[str] = []
    found_marker = False
    cursor = 0

    while cursor < total:
        raw = source_lines[cursor]

        # Inline form: exactly two "===" on the line and no leading "!"
        inline = re.search(r"===\s*(.+?)\s*===", raw)
        is_inline = bool(inline) and raw.count("===") == 2 and not raw.strip().startswith("!")
        if is_inline:
            payload = inline.group(1).strip()
            if payload and "===" not in payload:
                wrapped = f"{OUTPUT_INSTRUCTION_PREFIX}{payload}{OUTPUT_INSTRUCTION_SUFFIX}"
                emitted.append(raw.replace(inline.group(0), wrapped))
                found_marker = True
            else:
                # Empty or nested markers: keep the line untouched
                emitted.append(raw)
            cursor += 1
            continue

        # Ordinary line: copy through
        if not COMPILED_PRESERVE_FENCE_REGEX.match(raw.strip()):
            emitted.append(raw)
            cursor += 1
            continue

        # Opening fence: locate the matching closing fence
        closing = next(
            (idx for idx in range(cursor + 1, total) if COMPILED_PRESERVE_FENCE_REGEX.match(source_lines[idx].strip())),
            None,
        )
        if closing is None:
            # No end marker: emit the fence and everything after it unchanged
            emitted.extend(source_lines[cursor:])
            break

        body = "\n".join(source_lines[cursor + 1 : closing]).strip()

        # Title handling: split off the leading "#..." run up to the first
        # space (when it comes before any newline) or else the first newline
        heading = ""
        if body.startswith("#"):
            space_at = body.find(" ")
            newline_at = body.find("\n")
            if space_at != -1 and (newline_at == -1 or space_at < newline_at):
                split_at = space_at + 1
            elif newline_at != -1:
                split_at = newline_at + 1
            else:
                split_at = 0
            heading, body = body[:split_at], body[split_at:].strip()

        emitted.append(f"{OUTPUT_INSTRUCTION_PREFIX}{heading}{body}{OUTPUT_INSTRUCTION_SUFFIX}")
        found_marker = True
        cursor = closing + 1

    return "\n".join(emitted), found_marker
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
def extract_preserved_content(content: str) -> str:
    """
    Strip preserved-content markers and return what they wrap.

    Inline lines (===content===) are reduced to their inner text, fence
    marker lines are removed, and every other line is kept as written.
    An inline line whose inner text is empty or itself contains ``===``
    is dropped.

    Args:
        content: Preserved content containing preserved markers

    Returns:
        The remaining lines joined with newlines; an empty string for
        empty or whitespace-only input
    """
    text = content.strip()
    if not text:
        return ""

    kept: list[str] = []
    for raw_line in text.split("\n"):
        candidate = raw_line.strip()

        # Inline format: keep only the text between the markers
        inline = COMPILED_INLINE_PRESERVE_REGEX.match(candidate)
        if inline:
            payload = inline.group(1).strip()
            if payload and "===" not in payload:
                kept.append(payload)
            continue

        # Multiline delimiter: skip
        if COMPILED_PRESERVE_FENCE_REGEX.match(candidate):
            continue

        # Normal content line: keep with its original whitespace
        kept.append(raw_line)

    return "\n".join(kept)
|