deepresearch-flow 0.6.0__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff compares publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
deepresearch_flow/paper/snapshot/unpacker.py (new file)
@@ -0,0 +1,259 @@
+ """Unpack snapshot to recover original files with readable names.
+
+ This is the reverse operation of builder.build_snapshot().
+ """
+
+ from __future__ import annotations
+
+ from dataclasses import dataclass
+ import hashlib
+ import json
+ from pathlib import Path
+ import re
+ import sqlite3
+ from typing import Any, Iterable
+
+ from rich.console import Console
+ from rich.table import Table
+
+
+ @dataclass(frozen=True)
+ class SnapshotUnpackBaseOptions:
+     snapshot_db: Path
+     static_export_dir: Path
+     pdf_roots: list[Path]
+
+
+ @dataclass(frozen=True)
+ class SnapshotUnpackMdOptions(SnapshotUnpackBaseOptions):
+     md_output_dir: Path
+     md_translated_output_dir: Path
+
+
+ @dataclass(frozen=True)
+ class SnapshotUnpackInfoOptions(SnapshotUnpackBaseOptions):
+     template: str
+     output_json: Path
+
+
+ @dataclass
+ class UnpackCounts:
+     total: int = 0
+     succeeded: int = 0
+     failed: int = 0
+     missing_pdf: int = 0
+     translated_succeeded: int = 0
+     translated_failed: int = 0
+
+
+ def _sanitize_filename(title: str) -> str:
+     """Convert title to safe filename."""
+     sanitized = re.sub(r'[<>:"/\\|?*]', "_", title)
+     if len(sanitized) > 200:
+         sanitized = sanitized[:200]
+     sanitized = sanitized.strip()
+     if not sanitized:
+         sanitized = "untitled"
+     return sanitized
+
+
+ def _hash_file(path: Path) -> str:
+     digest = hashlib.sha256()
+     with path.open("rb") as handle:
+         for chunk in iter(lambda: handle.read(1024 * 1024), b""):
+             digest.update(chunk)
+     return digest.hexdigest()
+
+
+ def _build_pdf_hash_index(pdf_roots: Iterable[Path]) -> dict[str, Path]:
+     index: dict[str, Path] = {}
+     for root in pdf_roots:
+         if root.is_file() and root.suffix.lower() == ".pdf":
+             pdf_hash = _hash_file(root)
+             index.setdefault(pdf_hash, root)
+             continue
+         if not root.is_dir():
+             continue
+         for path in root.rglob("*.pdf"):
+             if not path.is_file():
+                 continue
+             pdf_hash = _hash_file(path)
+             index.setdefault(pdf_hash, path)
+     return index
+
+
+ def _unique_base_name(base: str, paper_id: str, used: set[str]) -> str:
+     candidate = base
+     if candidate in used:
+         candidate = f"{base}_{paper_id}"
+     counter = 1
+     while candidate in used:
+         candidate = f"{base}_{paper_id}_{counter}"
+         counter += 1
+     used.add(candidate)
+     return candidate
+
+
+ def _open_snapshot_db(path: Path) -> sqlite3.Connection:
+     conn = sqlite3.connect(path)
+     conn.row_factory = sqlite3.Row
+     return conn
+
+
+ def _print_summary(title: str, counts: UnpackCounts) -> None:
+     table = Table(title=title, header_style="bold cyan", title_style="bold magenta")
+     table.add_column("Metric", style="cyan", no_wrap=True)
+     table.add_column("Value", style="white", overflow="fold")
+     table.add_row("Total", str(counts.total))
+     table.add_row("Succeeded", str(counts.succeeded))
+     table.add_row("Failed", str(counts.failed))
+     table.add_row("Missing PDF", str(counts.missing_pdf))
+     if counts.translated_succeeded or counts.translated_failed:
+         table.add_row("Translated succeeded", str(counts.translated_succeeded))
+         table.add_row("Translated failed", str(counts.translated_failed))
+     Console().print(table)
+
+
+ def unpack_md(opts: SnapshotUnpackMdOptions) -> None:
+     """Unpack source/translated markdown and align filenames to PDFs."""
+     opts.md_output_dir.mkdir(parents=True, exist_ok=True)
+     opts.md_translated_output_dir.mkdir(parents=True, exist_ok=True)
+
+     pdf_index = _build_pdf_hash_index(opts.pdf_roots)
+     used_names: set[str] = set()
+     counts = UnpackCounts()
+
+     conn = _open_snapshot_db(opts.snapshot_db)
+     try:
+         cursor = conn.execute(
+             """
+             SELECT
+                 paper_id,
+                 title,
+                 source_hash,
+                 pdf_content_hash,
+                 source_md_content_hash
+             FROM paper
+             ORDER BY paper_index, title
+             """
+         )
+         for row in cursor.fetchall():
+             counts.total += 1
+             paper_id = str(row["paper_id"])
+             title = str(row["title"] or "")
+             pdf_hash = row["pdf_content_hash"]
+             md_hash = row["source_md_content_hash"]
+
+             base = ""
+             if pdf_hash and pdf_hash in pdf_index:
+                 base = pdf_index[pdf_hash].stem
+             else:
+                 counts.missing_pdf += 1
+                 base = _sanitize_filename(title)
+             base = _unique_base_name(base, paper_id, used_names)
+
+             if md_hash:
+                 src_md = opts.static_export_dir / "md" / f"{md_hash}.md"
+                 if src_md.exists():
+                     dst_md = opts.md_output_dir / f"{base}.md"
+                     try:
+                         dst_md.write_text(src_md.read_text(encoding="utf-8"), encoding="utf-8")
+                         counts.succeeded += 1
+                     except OSError:
+                         counts.failed += 1
+                 else:
+                     counts.failed += 1
+             else:
+                 counts.failed += 1
+
+
+             for tr_row in conn.execute(
+                 "SELECT lang, md_content_hash FROM paper_translation WHERE paper_id = ?",
+                 (paper_id,),
+             ):
+                 lang = str(tr_row["lang"] or "").lower()
+                 tr_hash = tr_row["md_content_hash"]
+                 if not lang or not tr_hash:
+                     counts.translated_failed += 1
+                     continue
+                 src_tr = opts.static_export_dir / "md_translate" / lang / f"{tr_hash}.md"
+                 if not src_tr.exists():
+                     counts.translated_failed += 1
+                     continue
+                 dst_tr = opts.md_translated_output_dir / f"{base}.{lang}.md"
+                 try:
+                     dst_tr.write_text(src_tr.read_text(encoding="utf-8"), encoding="utf-8")
+                     counts.translated_succeeded += 1
+                 except OSError:
+                     counts.translated_failed += 1
+     finally:
+         conn.close()
+
+     _print_summary("snapshot unpack md summary", counts)
+
+
+ def unpack_info(opts: SnapshotUnpackInfoOptions) -> None:
+     """Unpack aggregated paper_infos.json from snapshot summaries."""
+     pdf_index = _build_pdf_hash_index(opts.pdf_roots)
+     counts = UnpackCounts()
+     items: list[dict[str, Any]] = []
+
+     conn = _open_snapshot_db(opts.snapshot_db)
+     try:
+         cursor = conn.execute(
+             """
+             SELECT
+                 paper_id,
+                 title,
+                 source_hash,
+                 pdf_content_hash
+             FROM paper
+             ORDER BY paper_index, title
+             """
+         )
+         for row in cursor.fetchall():
+             counts.total += 1
+             paper_id = str(row["paper_id"])
+             pdf_hash = row["pdf_content_hash"]
+             if not (pdf_hash and pdf_hash in pdf_index):
+                 counts.missing_pdf += 1
+
+             summary_path = opts.static_export_dir / "summary" / paper_id / f"{opts.template}.json"
+             fallback_path = opts.static_export_dir / "summary" / f"{paper_id}.json"
+             target_path = summary_path if summary_path.exists() else fallback_path
+             used_fallback = target_path == fallback_path
+             if not target_path.exists():
+                 counts.failed += 1
+                 continue
+             try:
+                 payload = json.loads(target_path.read_text(encoding="utf-8"))
+             except json.JSONDecodeError:
+                 counts.failed += 1
+                 continue
+             if not isinstance(payload, dict):
+                 counts.failed += 1
+                 continue
+
+             base = ""
+             if pdf_hash and pdf_hash in pdf_index:
+                 base = pdf_index[pdf_hash].stem
+             else:
+                 base = _sanitize_filename(str(row["title"] or ""))
+             source_path = f"{base}.md" if base else ""
+
+             payload["paper_id"] = paper_id
+             payload["paper_title"] = str(row["title"] or "")
+             payload["source_path"] = source_path
+             payload["source_hash"] = str(row["source_hash"] or "")
+
+             if used_fallback:
+                 counts.failed += 1
+             else:
+                 counts.succeeded += 1
+             items.append(payload)
+     finally:
+         conn.close()
+
+     opts.output_json.parent.mkdir(parents=True, exist_ok=True)
+     opts.output_json.write_text(json.dumps(items, ensure_ascii=False, indent=2), encoding="utf-8")
+     _print_summary("snapshot unpack info summary", counts)
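
For orientation, a minimal usage sketch of the new module. The import path `deepresearch_flow.paper.snapshot.unpacker` comes from the RECORD below; all filesystem paths are hypothetical placeholders, and the template name `"default_paper"` is a guess based on the bundled `default_paper_schema.json`.

```python
# Minimal usage sketch; paths and template name are illustrative, not
# values prescribed by the package.
from pathlib import Path

from deepresearch_flow.paper.snapshot.unpacker import (
    SnapshotUnpackInfoOptions,
    SnapshotUnpackMdOptions,
    unpack_info,
    unpack_md,
)

snapshot_db = Path("./snapshot/snapshot.db")   # hypothetical snapshot DB
static_dir = Path("./snapshot/static")         # hypothetical static export
pdf_roots = [Path("./pdfs")]                   # PDFs used to recover names

# Recover source and translated markdown, naming files after matching PDFs.
unpack_md(
    SnapshotUnpackMdOptions(
        snapshot_db=snapshot_db,
        static_export_dir=static_dir,
        pdf_roots=pdf_roots,
        md_output_dir=Path("./unpacked/md"),
        md_translated_output_dir=Path("./unpacked/md_translated"),
    )
)

# Rebuild an aggregated paper_infos.json from per-paper summary JSON.
unpack_info(
    SnapshotUnpackInfoOptions(
        snapshot_db=snapshot_db,
        static_export_dir=static_dir,
        pdf_roots=pdf_roots,
        template="default_paper",
        output_json=Path("./unpacked/paper_infos.json"),
    )
)
```

Both functions finish by printing a rich summary table of total, succeeded, failed, and missing-PDF counts via `_print_summary`.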
deepresearch_flow-0.7.0.dist-info/METADATA
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: deepresearch-flow
- Version: 0.6.0
+ Version: 0.7.0
  Summary: Workflow tools for paper extraction, review, and research automation.
  Author-email: DengQi <dengqi935@gmail.com>
  License: MIT License
@@ -49,6 +49,7 @@ Requires-Dist: jinja2>=3.1.3
  Requires-Dist: json-repair>=0.55.1
  Requires-Dist: jsonschema>=4.26.0
  Requires-Dist: markdown-it-py>=3.0.0
+ Requires-Dist: fastmcp>=3.0.0b1
  Requires-Dist: mdit-py-plugins>=0.4.0
  Requires-Dist: pypdf>=6.6.2
  Requires-Dist: pylatexenc>=2.10
@@ -56,7 +57,7 @@ Requires-Dist: pybtex>=0.24.0
  Requires-Dist: rich>=14.3.1
  Requires-Dist: rumdl>=0.1.6
  Requires-Dist: starlette>=0.52.1
- Requires-Dist: tqdm>=4.66.4
+ Requires-Dist: tqdm>=4.67.2
  Requires-Dist: uvicorn>=0.27.1
  Dynamic: license-file

@@ -400,6 +401,41 @@ uv run deepresearch-flow paper db merge templates \

  Note: `paper db merge` is now split into `merge library` and `merge templates`.

+ ### Merge multiple databases (PDF + Markdown + BibTeX)
+
+ ```bash
+ # 1) Copy PDFs into a single folder
+ rsync -av ./pdfs_a/ ./pdfs_merged/
+ rsync -av ./pdfs_b/ ./pdfs_merged/
+
+ # 2) Copy Markdown folders into a single folder
+ rsync -av ./md_a/ ./md_merged/
+ rsync -av ./md_b/ ./md_merged/
+
+ # 3) Merge JSON libraries
+ uv run deepresearch-flow paper db merge library \
+   --inputs ./paper_infos_a.json \
+   --inputs ./paper_infos_b.json \
+   --output ./paper_infos_merged.json
+
+ # 4) Merge BibTeX files
+ uv run deepresearch-flow paper db merge bibtex \
+   -i ./library_a.bib \
+   -i ./library_b.bib \
+   -o ./library_merged.bib
+ ```
+
+ ### Merge BibTeX files
+
+ ```bash
+ uv run deepresearch-flow paper db merge bibtex \
+   -i ./library_a.bib \
+   -i ./library_b.bib \
+   -o ./library_merged.bib
+ ```
+
+ On duplicate keys, the entry with the most fields is kept; ties are broken by input order (the earlier input wins).
+
  ### Recommended: Merge templates then filter by BibTeX

  ```bash
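
The duplicate-key rule in the README addition above is easy to state precisely. A minimal sketch of that resolution policy, using plain dicts as stand-in BibTeX entries (illustrative only, not the library's actual merge code):

```python
# Stated policy: on a key collision, keep the entry with more fields;
# on a tie, keep the entry from the earlier input.
def merge_bibtex_entries(
    inputs: list[dict[str, dict[str, str]]],
) -> dict[str, dict[str, str]]:
    merged: dict[str, dict[str, str]] = {}
    for entries in inputs:  # inputs in the order given on the CLI
        for key, fields in entries.items():
            current = merged.get(key)
            # Strict ">" means an equally sized later entry never replaces
            # an earlier one, which encodes the tie-break rule.
            if current is None or len(fields) > len(current):
                merged[key] = fields
    return merged

a = {"smith2020": {"title": "A", "year": "2020"}}
b = {"smith2020": {"title": "A", "year": "2020", "doi": "10.1/x"}}
assert merge_bibtex_entries([a, b])["smith2020"]["doi"] == "10.1/x"
```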
deepresearch_flow-0.7.0.dist-info/RECORD
@@ -4,7 +4,7 @@ deepresearch_flow/cli.py,sha256=t4oowCNWldL0DrVJ4d0UlRkuGU2qHej_G0mAc_quteQ,455
  deepresearch_flow/paper/__init__.py,sha256=sunaOkcgAJBrfmcaJTumcWbPGVUSGWvOv2a2Yidzy0A,43
  deepresearch_flow/paper/cli.py,sha256=68d-yccScU0yL6d7eqZVdudPO6i_in8F4v-hKDWILMo,13647
  deepresearch_flow/paper/config.py,sha256=V7z4ApPXCV1acSl2FU3nZGq6nt8uisMhm0GtOq5zzmg,12021
- deepresearch_flow/paper/db.py,sha256=UL2q4CFI33a3DZsZ42VOS_3FtTORnQuAogUfzPVjcO0,86579
+ deepresearch_flow/paper/db.py,sha256=RvUN9jeoaEgLNvf8NhWYD-cgIIMZwdZRK3cq17pNWZI,94727
  deepresearch_flow/paper/db_ops.py,sha256=cb64jn2ax39i3tCS-0DYmlsJdGX3uBS2u5ncUIbUBic,73980
  deepresearch_flow/paper/extract.py,sha256=78ASAyNLfCl1AsAk2o_v1vskZCNZuayaCHgr0S4V2Vs,87632
  deepresearch_flow/paper/llm.py,sha256=mHfs5IkT3Q6BOh46MDlfUmgVTX24WRf0IKKoOnN8nV8,4007
@@ -43,11 +43,14 @@ deepresearch_flow/paper/schemas/default_paper_schema.json,sha256=6h_2ayHolJj8JMn
  deepresearch_flow/paper/schemas/eight_questions_schema.json,sha256=VFKKpdZkgPdQkYIW5jyrZQ7c2TlQZwB4svVWfoiwxdg,1005
  deepresearch_flow/paper/schemas/three_pass_schema.json,sha256=8aNr4EdRiilxszIRBCC4hRNXrfIOcdnVW4Qhe6Fnh0o,689
  deepresearch_flow/paper/snapshot/__init__.py,sha256=1VLO36xxDB3J5Yoo-HH9vyI-4ev2HcivXN0sNLg8O5k,102
- deepresearch_flow/paper/snapshot/api.py,sha256=WgkOgS7n_2Fx-Bl4KnLrh5nhRJAsWJaPjXu7vX5ubxY,36960
+ deepresearch_flow/paper/snapshot/api.py,sha256=F_qehvCjxTBTGj9FmqP4NnJQayUPJm0N5e_8mm5JlDQ,37405
  deepresearch_flow/paper/snapshot/builder.py,sha256=HbRcfNteMoP4RnQ4y2onZCm9XfnIvzXLn_EwsLZsDzY,38692
+ deepresearch_flow/paper/snapshot/common.py,sha256=KAhlGlPgabOCe9Faps8BoDqin71qpkCfaL_ADCr_9vg,917
  deepresearch_flow/paper/snapshot/identity.py,sha256=k9x1EZPFBU1qgxzkTGvwVtDjLgcosmM_udPuvRLl0uI,7748
+ deepresearch_flow/paper/snapshot/mcp_server.py,sha256=lvgbXmuZCZ_zaQMdZEMjN-OChHPdoZ9MmuuQ-7ORias,22901
  deepresearch_flow/paper/snapshot/schema.py,sha256=DcVmAklLYyEeDoVV9jYw7hoMHnHd9Eziivl-LP2busY,8991
  deepresearch_flow/paper/snapshot/text.py,sha256=0RnxLowa6AdirdLsUYym6BhWbjwiP2Qj2oZeA-pjmdE,4368
+ deepresearch_flow/paper/snapshot/unpacker.py,sha256=ScKSFdrQLJHrITHe9KAxgAEH-vAAnXLolvW9zeJ3wsc,8575
  deepresearch_flow/paper/snapshot/tests/__init__.py,sha256=G0IowrxHjGUIaqxcw6SvlcLFAtE5ZsleG6ECgd-sIdk,52
  deepresearch_flow/paper/snapshot/tests/test_identity.py,sha256=KDFixAUU9l68KOum7gf1IrD0Oy18dBCSXG7RbJTqflA,4520
  deepresearch_flow/paper/templates/__init__.py,sha256=p8W6kINvrf-T2X6Ow4GMr28syVOorFuMn0pbmieVzAw,35
@@ -463,9 +466,9 @@ deepresearch_flow/translator/placeholder.py,sha256=mEgqA-dPdOsIhno0h_hzfpXpY2asb
  deepresearch_flow/translator/prompts.py,sha256=EvfBvBIpQXARDj4m87GAyFXJGL8EJeahj_rOmp9mv68,5556
  deepresearch_flow/translator/protector.py,sha256=yUMuS2FgVofK_MRXrcauLRiwNvdCCjNAnh6CcNd686o,11777
  deepresearch_flow/translator/segment.py,sha256=rBFMCLTrvm2GrPc_hNFymi-8Ih2DAtUQlZHCRE9nLaM,5146
- deepresearch_flow-0.6.0.dist-info/licenses/LICENSE,sha256=hT8F2Py1pe6flxq3Ufdm2UKFk0B8CBm0aAQfsLXfvjw,1063
- deepresearch_flow-0.6.0.dist-info/METADATA,sha256=fyynvn8LYDTZlsIaKDr3SxQbR8nqQSOk3s85ZIh1t6E,25838
- deepresearch_flow-0.6.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
- deepresearch_flow-0.6.0.dist-info/entry_points.txt,sha256=1uIKscs0YRMg_mFsg9NjsaTt4CvQqQ_-zGERUKhhL_Y,65
- deepresearch_flow-0.6.0.dist-info/top_level.txt,sha256=qBl4RvPJNJUbL8CFfMNWxY0HpQLx5RlF_ko-z_aKpm0,18
- deepresearch_flow-0.6.0.dist-info/RECORD,,
+ deepresearch_flow-0.7.0.dist-info/licenses/LICENSE,sha256=hT8F2Py1pe6flxq3Ufdm2UKFk0B8CBm0aAQfsLXfvjw,1063
+ deepresearch_flow-0.7.0.dist-info/METADATA,sha256=aluWW1CXPeSWCLKopChdbgl_GHEQHByua1fBohr6Mzg,26728
+ deepresearch_flow-0.7.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+ deepresearch_flow-0.7.0.dist-info/entry_points.txt,sha256=1uIKscs0YRMg_mFsg9NjsaTt4CvQqQ_-zGERUKhhL_Y,65
+ deepresearch_flow-0.7.0.dist-info/top_level.txt,sha256=qBl4RvPJNJUbL8CFfMNWxY0HpQLx5RlF_ko-z_aKpm0,18
+ deepresearch_flow-0.7.0.dist-info/RECORD,,