deepresearch-flow 0.5.1__py3-none-any.whl → 0.6.0__py3-none-any.whl

This diff shows the differences between the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
Files changed (51)
  1. deepresearch_flow/paper/cli.py +63 -0
  2. deepresearch_flow/paper/config.py +87 -12
  3. deepresearch_flow/paper/db.py +1041 -34
  4. deepresearch_flow/paper/db_ops.py +124 -19
  5. deepresearch_flow/paper/extract.py +1546 -152
  6. deepresearch_flow/paper/prompt_templates/deep_read_phi_system.j2 +2 -0
  7. deepresearch_flow/paper/prompt_templates/deep_read_phi_user.j2 +5 -0
  8. deepresearch_flow/paper/prompt_templates/deep_read_system.j2 +2 -0
  9. deepresearch_flow/paper/prompt_templates/deep_read_user.j2 +272 -40
  10. deepresearch_flow/paper/prompt_templates/eight_questions_phi_system.j2 +1 -0
  11. deepresearch_flow/paper/prompt_templates/eight_questions_phi_user.j2 +2 -0
  12. deepresearch_flow/paper/prompt_templates/eight_questions_system.j2 +2 -0
  13. deepresearch_flow/paper/prompt_templates/eight_questions_user.j2 +4 -0
  14. deepresearch_flow/paper/prompt_templates/simple_phi_system.j2 +2 -0
  15. deepresearch_flow/paper/prompt_templates/simple_system.j2 +2 -0
  16. deepresearch_flow/paper/prompt_templates/simple_user.j2 +2 -0
  17. deepresearch_flow/paper/providers/azure_openai.py +45 -3
  18. deepresearch_flow/paper/providers/openai_compatible.py +45 -3
  19. deepresearch_flow/paper/schemas/deep_read_phi_schema.json +1 -0
  20. deepresearch_flow/paper/schemas/deep_read_schema.json +1 -0
  21. deepresearch_flow/paper/schemas/default_paper_schema.json +6 -0
  22. deepresearch_flow/paper/schemas/eight_questions_schema.json +1 -0
  23. deepresearch_flow/paper/snapshot/__init__.py +4 -0
  24. deepresearch_flow/paper/snapshot/api.py +941 -0
  25. deepresearch_flow/paper/snapshot/builder.py +965 -0
  26. deepresearch_flow/paper/snapshot/identity.py +239 -0
  27. deepresearch_flow/paper/snapshot/schema.py +245 -0
  28. deepresearch_flow/paper/snapshot/tests/__init__.py +2 -0
  29. deepresearch_flow/paper/snapshot/tests/test_identity.py +123 -0
  30. deepresearch_flow/paper/snapshot/text.py +154 -0
  31. deepresearch_flow/paper/template_registry.py +1 -0
  32. deepresearch_flow/paper/templates/deep_read.md.j2 +4 -0
  33. deepresearch_flow/paper/templates/deep_read_phi.md.j2 +4 -0
  34. deepresearch_flow/paper/templates/default_paper.md.j2 +4 -0
  35. deepresearch_flow/paper/templates/eight_questions.md.j2 +4 -0
  36. deepresearch_flow/paper/web/app.py +10 -3
  37. deepresearch_flow/recognize/cli.py +380 -103
  38. deepresearch_flow/recognize/markdown.py +31 -7
  39. deepresearch_flow/recognize/math.py +47 -12
  40. deepresearch_flow/recognize/mermaid.py +320 -10
  41. deepresearch_flow/recognize/organize.py +29 -7
  42. deepresearch_flow/translator/cli.py +71 -20
  43. deepresearch_flow/translator/engine.py +220 -81
  44. deepresearch_flow/translator/prompts.py +19 -2
  45. deepresearch_flow/translator/protector.py +15 -3
  46. {deepresearch_flow-0.5.1.dist-info → deepresearch_flow-0.6.0.dist-info}/METADATA +407 -33
  47. {deepresearch_flow-0.5.1.dist-info → deepresearch_flow-0.6.0.dist-info}/RECORD +51 -43
  48. {deepresearch_flow-0.5.1.dist-info → deepresearch_flow-0.6.0.dist-info}/WHEEL +1 -1
  49. {deepresearch_flow-0.5.1.dist-info → deepresearch_flow-0.6.0.dist-info}/entry_points.txt +0 -0
  50. {deepresearch_flow-0.5.1.dist-info → deepresearch_flow-0.6.0.dist-info}/licenses/LICENSE +0 -0
  51. {deepresearch_flow-0.5.1.dist-info → deepresearch_flow-0.6.0.dist-info}/top_level.txt +0 -0
@@ -67,9 +67,40 @@ def paper() -> None:
67
67
  @click.option("--split", is_flag=True, help="Write per-document JSON outputs")
68
68
  @click.option("--split-dir", "split_dir", default=None, help="Directory for split outputs")
69
69
  @click.option("--force", is_flag=True, help="Force re-extraction")
70
+ @click.option(
71
+ "--force-stage",
72
+ "force_stages",
73
+ multiple=True,
74
+ help="Force re-run specific stages (multi-stage templates only)",
75
+ )
70
76
  @click.option("--retry-failed", is_flag=True, help="Retry only failed documents")
77
+ @click.option(
78
+ "--retry-failed-stages",
79
+ is_flag=True,
80
+ help="Retry only failed stages per document (multi-stage templates only)",
81
+ )
82
+ @click.option(
83
+ "--retry-list-json",
84
+ "retry_list_json",
85
+ default=None,
86
+ help="Retry only documents listed in a verification report",
87
+ )
88
+ @click.option(
89
+ "--stage-dag",
90
+ is_flag=True,
91
+ help="Enable dependency-aware DAG scheduling (multi-stage templates only)",
92
+ )
93
+ @click.option("--start-idx", "start_idx", type=int, default=0, help="Start index for inputs")
94
+ @click.option(
95
+ "--end-idx",
96
+ "end_idx",
97
+ type=int,
98
+ default=-1,
99
+ help="End index (exclusive); -1 means to the last item",
100
+ )
71
101
  @click.option("--dry-run", is_flag=True, help="Discover inputs without calling providers")
72
102
  @click.option("--max-concurrency", "max_concurrency", type=int, default=None, help="Override max concurrency")
103
+ @click.option("--timeout", "timeout_seconds", type=float, default=None, help="Request timeout in seconds")
73
104
  @click.option("--sleep-every", "sleep_every", type=int, default=None, help="Sleep after every N requests")
74
105
  @click.option("--sleep-time", "sleep_time", type=float, default=None, help="Sleep duration in seconds")
75
106
  @click.option("--render-md", "render_md", is_flag=True, help="Render markdown outputs after extraction")
@@ -116,9 +147,16 @@ def extract(
116
147
  split: bool,
117
148
  split_dir: str | None,
118
149
  force: bool,
150
+ force_stages: tuple[str, ...],
119
151
  retry_failed: bool,
152
+ retry_failed_stages: bool,
153
+ retry_list_json: str | None,
154
+ stage_dag: bool,
155
+ start_idx: int,
156
+ end_idx: int,
120
157
  dry_run: bool,
121
158
  max_concurrency: int | None,
159
+ timeout_seconds: float | None,
122
160
  sleep_every: int | None,
123
161
  sleep_time: float | None,
124
162
  render_md: bool,
@@ -142,14 +180,28 @@ def extract(
142
180
  raise click.ClickException("max_concurrency must be positive")
143
181
  if config.extract.max_retries <= 0:
144
182
  raise click.ClickException("max_retries must be positive")
183
+ if config.extract.timeout <= 0:
184
+ raise click.ClickException("timeout must be positive")
145
185
  if max_concurrency is not None and max_concurrency <= 0:
146
186
  raise click.ClickException("--max-concurrency must be positive")
187
+ if timeout_seconds is not None and timeout_seconds <= 0:
188
+ raise click.ClickException("--timeout must be positive")
147
189
  if sleep_every is not None and sleep_every <= 0:
148
190
  raise click.ClickException("--sleep-every must be positive")
149
191
  if sleep_time is not None and sleep_time <= 0:
150
192
  raise click.ClickException("--sleep-time must be positive")
151
193
  if (sleep_every is None) != (sleep_time is None):
152
194
  raise click.ClickException("Both --sleep-every and --sleep-time are required")
195
+ if start_idx < 0:
196
+ raise click.ClickException("--start-idx must be >= 0")
197
+ if end_idx < -1:
198
+ raise click.ClickException("--end-idx must be -1 or >= 0")
199
+ if retry_failed and retry_failed_stages:
200
+ raise click.ClickException("--retry-failed and --retry-failed-stages are mutually exclusive")
201
+ if retry_list_json and (retry_failed or retry_failed_stages):
202
+ raise click.ClickException(
203
+ "--retry-list-json cannot be combined with --retry-failed or --retry-failed-stages"
204
+ )
153
205
 
154
206
  if provider.type in {
155
207
  "openai_compatible",
@@ -171,6 +223,8 @@ def extract(
171
223
  custom_prompt = bool(prompt_system or prompt_user or template_dir)
172
224
  if custom_prompt and prompt_template != "simple":
173
225
  raise click.ClickException("Custom prompts cannot be combined with built-in prompt templates")
226
+ if stage_dag and custom_prompt:
227
+ raise click.ClickException("--stage-dag requires a built-in multi-stage prompt template")
174
228
 
175
229
  schema_override = schema_path or None
176
230
  prompt_system_path = Path(prompt_system) if prompt_system else None
@@ -240,7 +294,9 @@ def extract(
240
294
 
241
295
  output = Path(output_path or config.extract.output)
242
296
  errors = Path(errors_path or config.extract.errors)
297
+ retry_list_path = Path(retry_list_json) if retry_list_json else None
243
298
  split_out = Path(split_dir) if split_dir else None
299
+ timeout_seconds_effective = timeout_seconds if timeout_seconds is not None else config.extract.timeout
244
300
 
245
301
  configure_logging(verbose)
246
302
 
@@ -258,9 +314,16 @@ def extract(
258
314
  split=split,
259
315
  split_dir=split_out,
260
316
  force=force,
317
+ force_stages=list(force_stages),
261
318
  retry_failed=retry_failed,
319
+ retry_failed_stages=retry_failed_stages,
320
+ retry_list_path=retry_list_path,
321
+ stage_dag=stage_dag or config.extract.stage_dag,
322
+ start_idx=start_idx,
323
+ end_idx=end_idx,
262
324
  dry_run=dry_run,
263
325
  max_concurrency_override=max_concurrency,
326
+ timeout_seconds=timeout_seconds_effective,
264
327
  prompt_template=prompt_template,
265
328
  output_language=output_language,
266
329
  custom_prompt=custom_prompt,
@@ -15,12 +15,15 @@ class ExtractConfig:
15
15
  errors: str
16
16
  max_concurrency: int
17
17
  max_retries: int
18
+ timeout: float
18
19
  backoff_base_seconds: float
19
20
  backoff_max_seconds: float
21
+ pause_threshold_seconds: float
20
22
  truncate_strategy: str
21
23
  truncate_max_chars: int
22
24
  cost_estimate: bool
23
25
  schema_path: str | None
26
+ stage_dag: bool
24
27
 
25
28
 
26
29
  @dataclass(frozen=True)
@@ -28,12 +31,20 @@ class RenderConfig:
28
31
  template_path: str | None
29
32
 
30
33
 
34
+ @dataclass(frozen=True)
35
+ class ApiKeyConfig:
36
+ key: str
37
+ quota_duration: int | None
38
+ reset_time: str | None
39
+ quota_error_tokens: list[str]
40
+
41
+
31
42
  @dataclass(frozen=True)
32
43
  class ProviderConfig:
33
44
  name: str
34
45
  type: str
35
46
  base_url: str
36
- api_keys: list[str]
47
+ api_keys: list[ApiKeyConfig]
37
48
  api_version: str | None
38
49
  deployment: str | None
39
50
  project_id: str | None
@@ -60,12 +71,15 @@ DEFAULT_EXTRACT = ExtractConfig(
60
71
  errors="paper_errors.json",
61
72
  max_concurrency=6,
62
73
  max_retries=3,
74
+ timeout=60.0,
63
75
  backoff_base_seconds=1.0,
64
76
  backoff_max_seconds=20.0,
77
+ pause_threshold_seconds=10.0,
65
78
  truncate_strategy="head_tail",
66
79
  truncate_max_chars=20000,
67
80
  cost_estimate=True,
68
81
  schema_path=None,
82
+ stage_dag=False,
69
83
  )
70
84
 
71
85
  DEFAULT_RENDER = RenderConfig(template_path=None)
@@ -103,6 +117,51 @@ def _as_str(value: Any, default: str | None = None) -> str | None:
103
117
  return str(value)
104
118
 
105
119
 
120
+ def _parse_api_keys(value: Any) -> list[ApiKeyConfig]:
121
+ if value is None:
122
+ return []
123
+ entries = value if isinstance(value, list) else [value]
124
+ parsed: list[ApiKeyConfig] = []
125
+ for entry in entries:
126
+ if isinstance(entry, dict):
127
+ key = _as_str(entry.get("key"))
128
+ if not key:
129
+ raise ValueError("api_keys object entries must include key")
130
+ quota_duration = entry.get("quota_duration")
131
+ quota_duration_value = int(quota_duration) if quota_duration is not None else None
132
+ if quota_duration_value is not None and quota_duration_value <= 0:
133
+ raise ValueError("quota_duration must be positive seconds")
134
+ reset_time = _as_str(entry.get("reset_time"), None)
135
+ tokens = entry.get("quota_error_tokens")
136
+ if tokens is None:
137
+ quota_error_tokens = []
138
+ elif isinstance(tokens, list):
139
+ quota_error_tokens = [str(token) for token in tokens]
140
+ else:
141
+ quota_error_tokens = [str(tokens)]
142
+ parsed.append(
143
+ ApiKeyConfig(
144
+ key=key,
145
+ quota_duration=quota_duration_value,
146
+ reset_time=reset_time,
147
+ quota_error_tokens=quota_error_tokens,
148
+ )
149
+ )
150
+ else:
151
+ key = _as_str(entry)
152
+ if not key:
153
+ continue
154
+ parsed.append(
155
+ ApiKeyConfig(
156
+ key=key,
157
+ quota_duration=None,
158
+ reset_time=None,
159
+ quota_error_tokens=[],
160
+ )
161
+ )
162
+ return parsed
163
+
164
+
106
165
  def _ensure_http_scheme(base_url: str, *, default_scheme: str = "http://") -> str:
107
166
  normalized = base_url.strip()
108
167
  if normalized.startswith(("http://", "https://")):
@@ -125,12 +184,16 @@ def load_config(path: str) -> PaperConfig:
125
184
  errors=_as_str(extract_data.get("errors"), DEFAULT_EXTRACT.errors) or DEFAULT_EXTRACT.errors,
126
185
  max_concurrency=_as_int(extract_data.get("max_concurrency"), DEFAULT_EXTRACT.max_concurrency),
127
186
  max_retries=_as_int(extract_data.get("max_retries"), DEFAULT_EXTRACT.max_retries),
187
+ timeout=_as_float(extract_data.get("timeout"), DEFAULT_EXTRACT.timeout),
128
188
  backoff_base_seconds=_as_float(
129
189
  extract_data.get("backoff_base_seconds"), DEFAULT_EXTRACT.backoff_base_seconds
130
190
  ),
131
191
  backoff_max_seconds=_as_float(
132
192
  extract_data.get("backoff_max_seconds"), DEFAULT_EXTRACT.backoff_max_seconds
133
193
  ),
194
+ pause_threshold_seconds=_as_float(
195
+ extract_data.get("pause_threshold_seconds"), DEFAULT_EXTRACT.pause_threshold_seconds
196
+ ),
134
197
  truncate_strategy=_as_str(
135
198
  extract_data.get("truncate_strategy"), DEFAULT_EXTRACT.truncate_strategy
136
199
  )
@@ -140,6 +203,7 @@ def load_config(path: str) -> PaperConfig:
140
203
  ),
141
204
  cost_estimate=_as_bool(extract_data.get("cost_estimate"), DEFAULT_EXTRACT.cost_estimate),
142
205
  schema_path=_as_str(extract_data.get("schema_path"), DEFAULT_EXTRACT.schema_path),
206
+ stage_dag=_as_bool(extract_data.get("stage_dag"), DEFAULT_EXTRACT.stage_dag),
143
207
  )
144
208
 
145
209
  render_data = data.get("render", {})
@@ -171,10 +235,10 @@ def load_config(path: str) -> PaperConfig:
171
235
  if provider_type == "ollama" and base_url:
172
236
  base_url = _ensure_http_scheme(base_url)
173
237
 
174
- api_keys = _as_list(provider.get("api_keys"))
238
+ api_keys = _parse_api_keys(provider.get("api_keys"))
175
239
  if not api_keys:
176
240
  api_key_single = provider.get("api_key")
177
- api_keys = _as_list(api_key_single)
241
+ api_keys = _parse_api_keys(api_key_single)
178
242
 
179
243
  structured_mode = _as_str(provider.get("structured_mode"), None)
180
244
  if structured_mode is None:
@@ -250,15 +314,26 @@ def load_config(path: str) -> PaperConfig:
250
314
  return PaperConfig(extract=extract, render=render, providers=providers)
251
315
 
252
316
 
253
- def resolve_api_keys(entries: list[str]) -> list[str]:
254
- resolved: list[str] = []
317
+ def resolve_api_key_configs(entries: list[ApiKeyConfig]) -> list[ApiKeyConfig]:
318
+ resolved: list[ApiKeyConfig] = []
255
319
  for entry in entries:
256
- entry = str(entry)
257
- if entry.startswith("env:"):
258
- env_name = entry.split(":", 1)[1]
320
+ key = entry.key
321
+ if key.startswith("env:"):
322
+ env_name = key.split(":", 1)[1]
259
323
  value = os.environ.get(env_name)
260
- if value:
261
- resolved.append(value)
262
- else:
263
- resolved.append(entry)
324
+ if not value:
325
+ continue
326
+ key = value
327
+ resolved.append(
328
+ ApiKeyConfig(
329
+ key=key,
330
+ quota_duration=entry.quota_duration,
331
+ reset_time=entry.reset_time,
332
+ quota_error_tokens=entry.quota_error_tokens,
333
+ )
334
+ )
264
335
  return resolved
336
+
337
+
338
+ def resolve_api_keys(entries: list[ApiKeyConfig]) -> list[str]:
339
+ return [entry.key for entry in resolve_api_key_configs(entries)]