deepresearch-flow 0.5.1__py3-none-any.whl → 0.6.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepresearch_flow/paper/cli.py +63 -0
- deepresearch_flow/paper/config.py +87 -12
- deepresearch_flow/paper/db.py +1154 -35
- deepresearch_flow/paper/db_ops.py +124 -19
- deepresearch_flow/paper/extract.py +1546 -152
- deepresearch_flow/paper/prompt_templates/deep_read_phi_system.j2 +2 -0
- deepresearch_flow/paper/prompt_templates/deep_read_phi_user.j2 +5 -0
- deepresearch_flow/paper/prompt_templates/deep_read_system.j2 +2 -0
- deepresearch_flow/paper/prompt_templates/deep_read_user.j2 +272 -40
- deepresearch_flow/paper/prompt_templates/eight_questions_phi_system.j2 +1 -0
- deepresearch_flow/paper/prompt_templates/eight_questions_phi_user.j2 +2 -0
- deepresearch_flow/paper/prompt_templates/eight_questions_system.j2 +2 -0
- deepresearch_flow/paper/prompt_templates/eight_questions_user.j2 +4 -0
- deepresearch_flow/paper/prompt_templates/simple_phi_system.j2 +2 -0
- deepresearch_flow/paper/prompt_templates/simple_system.j2 +2 -0
- deepresearch_flow/paper/prompt_templates/simple_user.j2 +2 -0
- deepresearch_flow/paper/providers/azure_openai.py +45 -3
- deepresearch_flow/paper/providers/openai_compatible.py +45 -3
- deepresearch_flow/paper/schemas/deep_read_phi_schema.json +1 -0
- deepresearch_flow/paper/schemas/deep_read_schema.json +1 -0
- deepresearch_flow/paper/schemas/default_paper_schema.json +6 -0
- deepresearch_flow/paper/schemas/eight_questions_schema.json +1 -0
- deepresearch_flow/paper/snapshot/__init__.py +4 -0
- deepresearch_flow/paper/snapshot/api.py +941 -0
- deepresearch_flow/paper/snapshot/builder.py +965 -0
- deepresearch_flow/paper/snapshot/identity.py +239 -0
- deepresearch_flow/paper/snapshot/schema.py +245 -0
- deepresearch_flow/paper/snapshot/tests/__init__.py +2 -0
- deepresearch_flow/paper/snapshot/tests/test_identity.py +123 -0
- deepresearch_flow/paper/snapshot/text.py +154 -0
- deepresearch_flow/paper/template_registry.py +1 -0
- deepresearch_flow/paper/templates/deep_read.md.j2 +4 -0
- deepresearch_flow/paper/templates/deep_read_phi.md.j2 +4 -0
- deepresearch_flow/paper/templates/default_paper.md.j2 +4 -0
- deepresearch_flow/paper/templates/eight_questions.md.j2 +4 -0
- deepresearch_flow/paper/web/app.py +10 -3
- deepresearch_flow/recognize/cli.py +380 -103
- deepresearch_flow/recognize/markdown.py +31 -7
- deepresearch_flow/recognize/math.py +47 -12
- deepresearch_flow/recognize/mermaid.py +320 -10
- deepresearch_flow/recognize/organize.py +29 -7
- deepresearch_flow/translator/cli.py +71 -20
- deepresearch_flow/translator/engine.py +220 -81
- deepresearch_flow/translator/prompts.py +19 -2
- deepresearch_flow/translator/protector.py +15 -3
- deepresearch_flow-0.6.1.dist-info/METADATA +849 -0
- {deepresearch_flow-0.5.1.dist-info → deepresearch_flow-0.6.1.dist-info}/RECORD +51 -43
- {deepresearch_flow-0.5.1.dist-info → deepresearch_flow-0.6.1.dist-info}/WHEEL +1 -1
- deepresearch_flow-0.5.1.dist-info/METADATA +0 -440
- {deepresearch_flow-0.5.1.dist-info → deepresearch_flow-0.6.1.dist-info}/entry_points.txt +0 -0
- {deepresearch_flow-0.5.1.dist-info → deepresearch_flow-0.6.1.dist-info}/licenses/LICENSE +0 -0
- {deepresearch_flow-0.5.1.dist-info → deepresearch_flow-0.6.1.dist-info}/top_level.txt +0 -0
deepresearch_flow/paper/cli.py
CHANGED
|
@@ -67,9 +67,40 @@ def paper() -> None:
|
|
|
67
67
|
@click.option("--split", is_flag=True, help="Write per-document JSON outputs")
|
|
68
68
|
@click.option("--split-dir", "split_dir", default=None, help="Directory for split outputs")
|
|
69
69
|
@click.option("--force", is_flag=True, help="Force re-extraction")
|
|
70
|
+
@click.option(
|
|
71
|
+
"--force-stage",
|
|
72
|
+
"force_stages",
|
|
73
|
+
multiple=True,
|
|
74
|
+
help="Force re-run specific stages (multi-stage templates only)",
|
|
75
|
+
)
|
|
70
76
|
@click.option("--retry-failed", is_flag=True, help="Retry only failed documents")
|
|
77
|
+
@click.option(
|
|
78
|
+
"--retry-failed-stages",
|
|
79
|
+
is_flag=True,
|
|
80
|
+
help="Retry only failed stages per document (multi-stage templates only)",
|
|
81
|
+
)
|
|
82
|
+
@click.option(
|
|
83
|
+
"--retry-list-json",
|
|
84
|
+
"retry_list_json",
|
|
85
|
+
default=None,
|
|
86
|
+
help="Retry only documents listed in a verification report",
|
|
87
|
+
)
|
|
88
|
+
@click.option(
|
|
89
|
+
"--stage-dag",
|
|
90
|
+
is_flag=True,
|
|
91
|
+
help="Enable dependency-aware DAG scheduling (multi-stage templates only)",
|
|
92
|
+
)
|
|
93
|
+
@click.option("--start-idx", "start_idx", type=int, default=0, help="Start index for inputs")
|
|
94
|
+
@click.option(
|
|
95
|
+
"--end-idx",
|
|
96
|
+
"end_idx",
|
|
97
|
+
type=int,
|
|
98
|
+
default=-1,
|
|
99
|
+
help="End index (exclusive); -1 means to the last item",
|
|
100
|
+
)
|
|
71
101
|
@click.option("--dry-run", is_flag=True, help="Discover inputs without calling providers")
|
|
72
102
|
@click.option("--max-concurrency", "max_concurrency", type=int, default=None, help="Override max concurrency")
|
|
103
|
+
@click.option("--timeout", "timeout_seconds", type=float, default=None, help="Request timeout in seconds")
|
|
73
104
|
@click.option("--sleep-every", "sleep_every", type=int, default=None, help="Sleep after every N requests")
|
|
74
105
|
@click.option("--sleep-time", "sleep_time", type=float, default=None, help="Sleep duration in seconds")
|
|
75
106
|
@click.option("--render-md", "render_md", is_flag=True, help="Render markdown outputs after extraction")
|
|
@@ -116,9 +147,16 @@ def extract(
|
|
|
116
147
|
split: bool,
|
|
117
148
|
split_dir: str | None,
|
|
118
149
|
force: bool,
|
|
150
|
+
force_stages: tuple[str, ...],
|
|
119
151
|
retry_failed: bool,
|
|
152
|
+
retry_failed_stages: bool,
|
|
153
|
+
retry_list_json: str | None,
|
|
154
|
+
stage_dag: bool,
|
|
155
|
+
start_idx: int,
|
|
156
|
+
end_idx: int,
|
|
120
157
|
dry_run: bool,
|
|
121
158
|
max_concurrency: int | None,
|
|
159
|
+
timeout_seconds: float | None,
|
|
122
160
|
sleep_every: int | None,
|
|
123
161
|
sleep_time: float | None,
|
|
124
162
|
render_md: bool,
|
|
@@ -142,14 +180,28 @@ def extract(
|
|
|
142
180
|
raise click.ClickException("max_concurrency must be positive")
|
|
143
181
|
if config.extract.max_retries <= 0:
|
|
144
182
|
raise click.ClickException("max_retries must be positive")
|
|
183
|
+
if config.extract.timeout <= 0:
|
|
184
|
+
raise click.ClickException("timeout must be positive")
|
|
145
185
|
if max_concurrency is not None and max_concurrency <= 0:
|
|
146
186
|
raise click.ClickException("--max-concurrency must be positive")
|
|
187
|
+
if timeout_seconds is not None and timeout_seconds <= 0:
|
|
188
|
+
raise click.ClickException("--timeout must be positive")
|
|
147
189
|
if sleep_every is not None and sleep_every <= 0:
|
|
148
190
|
raise click.ClickException("--sleep-every must be positive")
|
|
149
191
|
if sleep_time is not None and sleep_time <= 0:
|
|
150
192
|
raise click.ClickException("--sleep-time must be positive")
|
|
151
193
|
if (sleep_every is None) != (sleep_time is None):
|
|
152
194
|
raise click.ClickException("Both --sleep-every and --sleep-time are required")
|
|
195
|
+
if start_idx < 0:
|
|
196
|
+
raise click.ClickException("--start-idx must be >= 0")
|
|
197
|
+
if end_idx < -1:
|
|
198
|
+
raise click.ClickException("--end-idx must be -1 or >= 0")
|
|
199
|
+
if retry_failed and retry_failed_stages:
|
|
200
|
+
raise click.ClickException("--retry-failed and --retry-failed-stages are mutually exclusive")
|
|
201
|
+
if retry_list_json and (retry_failed or retry_failed_stages):
|
|
202
|
+
raise click.ClickException(
|
|
203
|
+
"--retry-list-json cannot be combined with --retry-failed or --retry-failed-stages"
|
|
204
|
+
)
|
|
153
205
|
|
|
154
206
|
if provider.type in {
|
|
155
207
|
"openai_compatible",
|
|
@@ -171,6 +223,8 @@ def extract(
|
|
|
171
223
|
custom_prompt = bool(prompt_system or prompt_user or template_dir)
|
|
172
224
|
if custom_prompt and prompt_template != "simple":
|
|
173
225
|
raise click.ClickException("Custom prompts cannot be combined with built-in prompt templates")
|
|
226
|
+
if stage_dag and custom_prompt:
|
|
227
|
+
raise click.ClickException("--stage-dag requires a built-in multi-stage prompt template")
|
|
174
228
|
|
|
175
229
|
schema_override = schema_path or None
|
|
176
230
|
prompt_system_path = Path(prompt_system) if prompt_system else None
|
|
@@ -240,7 +294,9 @@ def extract(
|
|
|
240
294
|
|
|
241
295
|
output = Path(output_path or config.extract.output)
|
|
242
296
|
errors = Path(errors_path or config.extract.errors)
|
|
297
|
+
retry_list_path = Path(retry_list_json) if retry_list_json else None
|
|
243
298
|
split_out = Path(split_dir) if split_dir else None
|
|
299
|
+
timeout_seconds_effective = timeout_seconds if timeout_seconds is not None else config.extract.timeout
|
|
244
300
|
|
|
245
301
|
configure_logging(verbose)
|
|
246
302
|
|
|
@@ -258,9 +314,16 @@ def extract(
|
|
|
258
314
|
split=split,
|
|
259
315
|
split_dir=split_out,
|
|
260
316
|
force=force,
|
|
317
|
+
force_stages=list(force_stages),
|
|
261
318
|
retry_failed=retry_failed,
|
|
319
|
+
retry_failed_stages=retry_failed_stages,
|
|
320
|
+
retry_list_path=retry_list_path,
|
|
321
|
+
stage_dag=stage_dag or config.extract.stage_dag,
|
|
322
|
+
start_idx=start_idx,
|
|
323
|
+
end_idx=end_idx,
|
|
262
324
|
dry_run=dry_run,
|
|
263
325
|
max_concurrency_override=max_concurrency,
|
|
326
|
+
timeout_seconds=timeout_seconds_effective,
|
|
264
327
|
prompt_template=prompt_template,
|
|
265
328
|
output_language=output_language,
|
|
266
329
|
custom_prompt=custom_prompt,
|
|
@@ -15,12 +15,15 @@ class ExtractConfig:
|
|
|
15
15
|
errors: str
|
|
16
16
|
max_concurrency: int
|
|
17
17
|
max_retries: int
|
|
18
|
+
timeout: float
|
|
18
19
|
backoff_base_seconds: float
|
|
19
20
|
backoff_max_seconds: float
|
|
21
|
+
pause_threshold_seconds: float
|
|
20
22
|
truncate_strategy: str
|
|
21
23
|
truncate_max_chars: int
|
|
22
24
|
cost_estimate: bool
|
|
23
25
|
schema_path: str | None
|
|
26
|
+
stage_dag: bool
|
|
24
27
|
|
|
25
28
|
|
|
26
29
|
@dataclass(frozen=True)
|
|
@@ -28,12 +31,20 @@ class RenderConfig:
|
|
|
28
31
|
template_path: str | None
|
|
29
32
|
|
|
30
33
|
|
|
34
|
+
@dataclass(frozen=True)
|
|
35
|
+
class ApiKeyConfig:
|
|
36
|
+
key: str
|
|
37
|
+
quota_duration: int | None
|
|
38
|
+
reset_time: str | None
|
|
39
|
+
quota_error_tokens: list[str]
|
|
40
|
+
|
|
41
|
+
|
|
31
42
|
@dataclass(frozen=True)
|
|
32
43
|
class ProviderConfig:
|
|
33
44
|
name: str
|
|
34
45
|
type: str
|
|
35
46
|
base_url: str
|
|
36
|
-
api_keys: list[
|
|
47
|
+
api_keys: list[ApiKeyConfig]
|
|
37
48
|
api_version: str | None
|
|
38
49
|
deployment: str | None
|
|
39
50
|
project_id: str | None
|
|
@@ -60,12 +71,15 @@ DEFAULT_EXTRACT = ExtractConfig(
|
|
|
60
71
|
errors="paper_errors.json",
|
|
61
72
|
max_concurrency=6,
|
|
62
73
|
max_retries=3,
|
|
74
|
+
timeout=60.0,
|
|
63
75
|
backoff_base_seconds=1.0,
|
|
64
76
|
backoff_max_seconds=20.0,
|
|
77
|
+
pause_threshold_seconds=10.0,
|
|
65
78
|
truncate_strategy="head_tail",
|
|
66
79
|
truncate_max_chars=20000,
|
|
67
80
|
cost_estimate=True,
|
|
68
81
|
schema_path=None,
|
|
82
|
+
stage_dag=False,
|
|
69
83
|
)
|
|
70
84
|
|
|
71
85
|
DEFAULT_RENDER = RenderConfig(template_path=None)
|
|
@@ -103,6 +117,51 @@ def _as_str(value: Any, default: str | None = None) -> str | None:
|
|
|
103
117
|
return str(value)
|
|
104
118
|
|
|
105
119
|
|
|
120
|
+
def _parse_api_keys(value: Any) -> list[ApiKeyConfig]:
|
|
121
|
+
if value is None:
|
|
122
|
+
return []
|
|
123
|
+
entries = value if isinstance(value, list) else [value]
|
|
124
|
+
parsed: list[ApiKeyConfig] = []
|
|
125
|
+
for entry in entries:
|
|
126
|
+
if isinstance(entry, dict):
|
|
127
|
+
key = _as_str(entry.get("key"))
|
|
128
|
+
if not key:
|
|
129
|
+
raise ValueError("api_keys object entries must include key")
|
|
130
|
+
quota_duration = entry.get("quota_duration")
|
|
131
|
+
quota_duration_value = int(quota_duration) if quota_duration is not None else None
|
|
132
|
+
if quota_duration_value is not None and quota_duration_value <= 0:
|
|
133
|
+
raise ValueError("quota_duration must be positive seconds")
|
|
134
|
+
reset_time = _as_str(entry.get("reset_time"), None)
|
|
135
|
+
tokens = entry.get("quota_error_tokens")
|
|
136
|
+
if tokens is None:
|
|
137
|
+
quota_error_tokens = []
|
|
138
|
+
elif isinstance(tokens, list):
|
|
139
|
+
quota_error_tokens = [str(token) for token in tokens]
|
|
140
|
+
else:
|
|
141
|
+
quota_error_tokens = [str(tokens)]
|
|
142
|
+
parsed.append(
|
|
143
|
+
ApiKeyConfig(
|
|
144
|
+
key=key,
|
|
145
|
+
quota_duration=quota_duration_value,
|
|
146
|
+
reset_time=reset_time,
|
|
147
|
+
quota_error_tokens=quota_error_tokens,
|
|
148
|
+
)
|
|
149
|
+
)
|
|
150
|
+
else:
|
|
151
|
+
key = _as_str(entry)
|
|
152
|
+
if not key:
|
|
153
|
+
continue
|
|
154
|
+
parsed.append(
|
|
155
|
+
ApiKeyConfig(
|
|
156
|
+
key=key,
|
|
157
|
+
quota_duration=None,
|
|
158
|
+
reset_time=None,
|
|
159
|
+
quota_error_tokens=[],
|
|
160
|
+
)
|
|
161
|
+
)
|
|
162
|
+
return parsed
|
|
163
|
+
|
|
164
|
+
|
|
106
165
|
def _ensure_http_scheme(base_url: str, *, default_scheme: str = "http://") -> str:
|
|
107
166
|
normalized = base_url.strip()
|
|
108
167
|
if normalized.startswith(("http://", "https://")):
|
|
@@ -125,12 +184,16 @@ def load_config(path: str) -> PaperConfig:
|
|
|
125
184
|
errors=_as_str(extract_data.get("errors"), DEFAULT_EXTRACT.errors) or DEFAULT_EXTRACT.errors,
|
|
126
185
|
max_concurrency=_as_int(extract_data.get("max_concurrency"), DEFAULT_EXTRACT.max_concurrency),
|
|
127
186
|
max_retries=_as_int(extract_data.get("max_retries"), DEFAULT_EXTRACT.max_retries),
|
|
187
|
+
timeout=_as_float(extract_data.get("timeout"), DEFAULT_EXTRACT.timeout),
|
|
128
188
|
backoff_base_seconds=_as_float(
|
|
129
189
|
extract_data.get("backoff_base_seconds"), DEFAULT_EXTRACT.backoff_base_seconds
|
|
130
190
|
),
|
|
131
191
|
backoff_max_seconds=_as_float(
|
|
132
192
|
extract_data.get("backoff_max_seconds"), DEFAULT_EXTRACT.backoff_max_seconds
|
|
133
193
|
),
|
|
194
|
+
pause_threshold_seconds=_as_float(
|
|
195
|
+
extract_data.get("pause_threshold_seconds"), DEFAULT_EXTRACT.pause_threshold_seconds
|
|
196
|
+
),
|
|
134
197
|
truncate_strategy=_as_str(
|
|
135
198
|
extract_data.get("truncate_strategy"), DEFAULT_EXTRACT.truncate_strategy
|
|
136
199
|
)
|
|
@@ -140,6 +203,7 @@ def load_config(path: str) -> PaperConfig:
|
|
|
140
203
|
),
|
|
141
204
|
cost_estimate=_as_bool(extract_data.get("cost_estimate"), DEFAULT_EXTRACT.cost_estimate),
|
|
142
205
|
schema_path=_as_str(extract_data.get("schema_path"), DEFAULT_EXTRACT.schema_path),
|
|
206
|
+
stage_dag=_as_bool(extract_data.get("stage_dag"), DEFAULT_EXTRACT.stage_dag),
|
|
143
207
|
)
|
|
144
208
|
|
|
145
209
|
render_data = data.get("render", {})
|
|
@@ -171,10 +235,10 @@ def load_config(path: str) -> PaperConfig:
|
|
|
171
235
|
if provider_type == "ollama" and base_url:
|
|
172
236
|
base_url = _ensure_http_scheme(base_url)
|
|
173
237
|
|
|
174
|
-
api_keys =
|
|
238
|
+
api_keys = _parse_api_keys(provider.get("api_keys"))
|
|
175
239
|
if not api_keys:
|
|
176
240
|
api_key_single = provider.get("api_key")
|
|
177
|
-
api_keys =
|
|
241
|
+
api_keys = _parse_api_keys(api_key_single)
|
|
178
242
|
|
|
179
243
|
structured_mode = _as_str(provider.get("structured_mode"), None)
|
|
180
244
|
if structured_mode is None:
|
|
@@ -250,15 +314,26 @@ def load_config(path: str) -> PaperConfig:
|
|
|
250
314
|
return PaperConfig(extract=extract, render=render, providers=providers)
|
|
251
315
|
|
|
252
316
|
|
|
253
|
-
def
|
|
254
|
-
resolved: list[
|
|
317
|
+
def resolve_api_key_configs(entries: list[ApiKeyConfig]) -> list[ApiKeyConfig]:
|
|
318
|
+
resolved: list[ApiKeyConfig] = []
|
|
255
319
|
for entry in entries:
|
|
256
|
-
|
|
257
|
-
if
|
|
258
|
-
env_name =
|
|
320
|
+
key = entry.key
|
|
321
|
+
if key.startswith("env:"):
|
|
322
|
+
env_name = key.split(":", 1)[1]
|
|
259
323
|
value = os.environ.get(env_name)
|
|
260
|
-
if value:
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
324
|
+
if not value:
|
|
325
|
+
continue
|
|
326
|
+
key = value
|
|
327
|
+
resolved.append(
|
|
328
|
+
ApiKeyConfig(
|
|
329
|
+
key=key,
|
|
330
|
+
quota_duration=entry.quota_duration,
|
|
331
|
+
reset_time=entry.reset_time,
|
|
332
|
+
quota_error_tokens=entry.quota_error_tokens,
|
|
333
|
+
)
|
|
334
|
+
)
|
|
264
335
|
return resolved
|
|
336
|
+
|
|
337
|
+
|
|
338
|
+
def resolve_api_keys(entries: list[ApiKeyConfig]) -> list[str]:
|
|
339
|
+
return [entry.key for entry in resolve_api_key_configs(entries)]
|