deepresearch-flow 0.6.0__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff compares publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
deepresearch_flow/paper/snapshot/unpacker.py (new file)
@@ -0,0 +1,259 @@
+ """Unpack snapshot to recover original files with readable names.
+
+ This is the reverse operation of builder.build_snapshot().
+ """
+
+ from __future__ import annotations
+
+ from dataclasses import dataclass
+ import hashlib
+ import json
+ from pathlib import Path
+ import re
+ import sqlite3
+ from typing import Any, Iterable
+
+ from rich.console import Console
+ from rich.table import Table
+
+
+ @dataclass(frozen=True)
+ class SnapshotUnpackBaseOptions:
+     snapshot_db: Path
+     static_export_dir: Path
+     pdf_roots: list[Path]
+
+
+ @dataclass(frozen=True)
+ class SnapshotUnpackMdOptions(SnapshotUnpackBaseOptions):
+     md_output_dir: Path
+     md_translated_output_dir: Path
+
+
+ @dataclass(frozen=True)
+ class SnapshotUnpackInfoOptions(SnapshotUnpackBaseOptions):
+     template: str
+     output_json: Path
+
+
+ @dataclass
+ class UnpackCounts:
+     total: int = 0
+     succeeded: int = 0
+     failed: int = 0
+     missing_pdf: int = 0
+     translated_succeeded: int = 0
+     translated_failed: int = 0
+
+
+ def _sanitize_filename(title: str) -> str:
+     """Convert title to safe filename."""
+     sanitized = re.sub(r'[<>:"/\\|?*]', "_", title)
+     if len(sanitized) > 200:
+         sanitized = sanitized[:200]
+     sanitized = sanitized.strip()
+     if not sanitized:
+         sanitized = "untitled"
+     return sanitized
+
+
+ def _hash_file(path: Path) -> str:
+     digest = hashlib.sha256()
+     with path.open("rb") as handle:
+         for chunk in iter(lambda: handle.read(1024 * 1024), b""):
+             digest.update(chunk)
+     return digest.hexdigest()
+
+
+ def _build_pdf_hash_index(pdf_roots: Iterable[Path]) -> dict[str, Path]:
+     index: dict[str, Path] = {}
+     for root in pdf_roots:
+         if root.is_file() and root.suffix.lower() == ".pdf":
+             pdf_hash = _hash_file(root)
+             index.setdefault(pdf_hash, root)
+             continue
+         if not root.is_dir():
+             continue
+         for path in root.rglob("*.pdf"):
+             if not path.is_file():
+                 continue
+             pdf_hash = _hash_file(path)
+             index.setdefault(pdf_hash, path)
+     return index
+
+
+ def _unique_base_name(base: str, paper_id: str, used: set[str]) -> str:
+     candidate = base
+     if candidate in used:
+         candidate = f"{base}_{paper_id}"
+     counter = 1
+     while candidate in used:
+         candidate = f"{base}_{paper_id}_{counter}"
+         counter += 1
+     used.add(candidate)
+     return candidate
+
+
+ def _open_snapshot_db(path: Path) -> sqlite3.Connection:
+     conn = sqlite3.connect(path)
+     conn.row_factory = sqlite3.Row
+     return conn
+
+
+ def _print_summary(title: str, counts: UnpackCounts) -> None:
+     table = Table(title=title, header_style="bold cyan", title_style="bold magenta")
+     table.add_column("Metric", style="cyan", no_wrap=True)
+     table.add_column("Value", style="white", overflow="fold")
+     table.add_row("Total", str(counts.total))
+     table.add_row("Succeeded", str(counts.succeeded))
+     table.add_row("Failed", str(counts.failed))
+     table.add_row("Missing PDF", str(counts.missing_pdf))
+     if counts.translated_succeeded or counts.translated_failed:
+         table.add_row("Translated succeeded", str(counts.translated_succeeded))
+         table.add_row("Translated failed", str(counts.translated_failed))
+     Console().print(table)
+
+
+ def unpack_md(opts: SnapshotUnpackMdOptions) -> None:
+     """Unpack source/translated markdown and align filenames to PDFs."""
+     opts.md_output_dir.mkdir(parents=True, exist_ok=True)
+     opts.md_translated_output_dir.mkdir(parents=True, exist_ok=True)
+
+     pdf_index = _build_pdf_hash_index(opts.pdf_roots)
+     used_names: set[str] = set()
+     counts = UnpackCounts()
+
+     conn = _open_snapshot_db(opts.snapshot_db)
+     try:
+         cursor = conn.execute(
+             """
+             SELECT
+                 paper_id,
+                 title,
+                 source_hash,
+                 pdf_content_hash,
+                 source_md_content_hash
+             FROM paper
+             ORDER BY paper_index, title
+             """
+         )
+         for row in cursor.fetchall():
+             counts.total += 1
+             paper_id = str(row["paper_id"])
+             title = str(row["title"] or "")
+             pdf_hash = row["pdf_content_hash"]
+             md_hash = row["source_md_content_hash"]
+
+             base = ""
+             if pdf_hash and pdf_hash in pdf_index:
+                 base = pdf_index[pdf_hash].stem
+             else:
+                 counts.missing_pdf += 1
+                 base = _sanitize_filename(title)
+             base = _unique_base_name(base, paper_id, used_names)
+
+             if md_hash:
+                 src_md = opts.static_export_dir / "md" / f"{md_hash}.md"
+                 if src_md.exists():
+                     dst_md = opts.md_output_dir / f"{base}.md"
+                     try:
+                         dst_md.write_text(src_md.read_text(encoding="utf-8"), encoding="utf-8")
+                         counts.succeeded += 1
+                     except OSError:
+                         counts.failed += 1
+                 else:
+                     counts.failed += 1
+             else:
+                 counts.failed += 1
+
+
+             for tr_row in conn.execute(
+                 "SELECT lang, md_content_hash FROM paper_translation WHERE paper_id = ?",
+                 (paper_id,),
+             ):
+                 lang = str(tr_row["lang"] or "").lower()
+                 tr_hash = tr_row["md_content_hash"]
+                 if not lang or not tr_hash:
+                     counts.translated_failed += 1
+                     continue
+                 src_tr = opts.static_export_dir / "md_translate" / lang / f"{tr_hash}.md"
+                 if not src_tr.exists():
+                     counts.translated_failed += 1
+                     continue
+                 dst_tr = opts.md_translated_output_dir / f"{base}.{lang}.md"
+                 try:
+                     dst_tr.write_text(src_tr.read_text(encoding="utf-8"), encoding="utf-8")
+                     counts.translated_succeeded += 1
+                 except OSError:
+                     counts.translated_failed += 1
+     finally:
+         conn.close()
+
+     _print_summary("snapshot unpack md summary", counts)
+
+
+ def unpack_info(opts: SnapshotUnpackInfoOptions) -> None:
+     """Unpack aggregated paper_infos.json from snapshot summaries."""
+     pdf_index = _build_pdf_hash_index(opts.pdf_roots)
+     counts = UnpackCounts()
+     items: list[dict[str, Any]] = []
+
+     conn = _open_snapshot_db(opts.snapshot_db)
+     try:
+         cursor = conn.execute(
+             """
+             SELECT
+                 paper_id,
+                 title,
+                 source_hash,
+                 pdf_content_hash
+             FROM paper
+             ORDER BY paper_index, title
+             """
+         )
+         for row in cursor.fetchall():
+             counts.total += 1
+             paper_id = str(row["paper_id"])
+             pdf_hash = row["pdf_content_hash"]
+             if not (pdf_hash and pdf_hash in pdf_index):
+                 counts.missing_pdf += 1
+
+             summary_path = opts.static_export_dir / "summary" / paper_id / f"{opts.template}.json"
+             fallback_path = opts.static_export_dir / "summary" / f"{paper_id}.json"
+             target_path = summary_path if summary_path.exists() else fallback_path
+             used_fallback = target_path == fallback_path
+             if not target_path.exists():
+                 counts.failed += 1
+                 continue
+             try:
+                 payload = json.loads(target_path.read_text(encoding="utf-8"))
+             except json.JSONDecodeError:
+                 counts.failed += 1
+                 continue
+             if not isinstance(payload, dict):
+                 counts.failed += 1
+                 continue
+
+             base = ""
+             if pdf_hash and pdf_hash in pdf_index:
+                 base = pdf_index[pdf_hash].stem
+             else:
+                 base = _sanitize_filename(str(row["title"] or ""))
+             source_path = f"{base}.md" if base else ""
+
+             payload["paper_id"] = paper_id
+             payload["paper_title"] = str(row["title"] or "")
+             payload["source_path"] = source_path
+             payload["source_hash"] = str(row["source_hash"] or "")
+
+             if used_fallback:
+                 counts.failed += 1
+             else:
+                 counts.succeeded += 1
+             items.append(payload)
+     finally:
+         conn.close()
+
+     opts.output_json.parent.mkdir(parents=True, exist_ok=True)
+     opts.output_json.write_text(json.dumps(items, ensure_ascii=False, indent=2), encoding="utf-8")
+     _print_summary("snapshot unpack info summary", counts)
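
For orientation, a minimal usage sketch of the new module. The import path `deepresearch_flow.paper.snapshot.unpacker` comes from the RECORD below; all filesystem paths are hypothetical placeholders, and the template name `"default_paper"` is a guess based on the bundled `default_paper_schema.json`.

```python
# Minimal usage sketch; paths and template name are illustrative, not
# values prescribed by the package.
from pathlib import Path

from deepresearch_flow.paper.snapshot.unpacker import (
    SnapshotUnpackInfoOptions,
    SnapshotUnpackMdOptions,
    unpack_info,
    unpack_md,
)

snapshot_db = Path("./snapshot/snapshot.db")   # hypothetical snapshot DB
static_dir = Path("./snapshot/static")         # hypothetical static export
pdf_roots = [Path("./pdfs")]                   # PDFs used to recover names

# Recover source and translated markdown, naming files after matching PDFs.
unpack_md(
    SnapshotUnpackMdOptions(
        snapshot_db=snapshot_db,
        static_export_dir=static_dir,
        pdf_roots=pdf_roots,
        md_output_dir=Path("./unpacked/md"),
        md_translated_output_dir=Path("./unpacked/md_translated"),
    )
)

# Rebuild an aggregated paper_infos.json from per-paper summary JSON.
unpack_info(
    SnapshotUnpackInfoOptions(
        snapshot_db=snapshot_db,
        static_export_dir=static_dir,
        pdf_roots=pdf_roots,
        template="default_paper",
        output_json=Path("./unpacked/paper_infos.json"),
    )
)
```

Both functions finish by printing a rich summary table of total, succeeded, failed, and missing-PDF counts via `_print_summary`.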
deepresearch_flow-0.7.0.dist-info/METADATA
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: deepresearch-flow
- Version: 0.6.0
+ Version: 0.7.0
  Summary: Workflow tools for paper extraction, review, and research automation.
  Author-email: DengQi <dengqi935@gmail.com>
  License: MIT License
@@ -49,6 +49,7 @@ Requires-Dist: jinja2>=3.1.3
  Requires-Dist: json-repair>=0.55.1
  Requires-Dist: jsonschema>=4.26.0
  Requires-Dist: markdown-it-py>=3.0.0
+ Requires-Dist: fastmcp>=3.0.0b1
  Requires-Dist: mdit-py-plugins>=0.4.0
  Requires-Dist: pypdf>=6.6.2
  Requires-Dist: pylatexenc>=2.10
@@ -56,7 +57,7 @@ Requires-Dist: pybtex>=0.24.0
  Requires-Dist: rich>=14.3.1
  Requires-Dist: rumdl>=0.1.6
  Requires-Dist: starlette>=0.52.1
- Requires-Dist: tqdm>=4.66.4
+ Requires-Dist: tqdm>=4.67.2
  Requires-Dist: uvicorn>=0.27.1
  Dynamic: license-file

@@ -400,6 +401,41 @@ uv run deepresearch-flow paper db merge templates \

  Note: `paper db merge` is now split into `merge library` and `merge templates`.

+ ### Merge multiple databases (PDF + Markdown + BibTeX)
+
+ ```bash
+ # 1) Copy PDFs into a single folder
+ rsync -av ./pdfs_a/ ./pdfs_merged/
+ rsync -av ./pdfs_b/ ./pdfs_merged/
+
+ # 2) Copy Markdown folders into a single folder
+ rsync -av ./md_a/ ./md_merged/
+ rsync -av ./md_b/ ./md_merged/
+
+ # 3) Merge JSON libraries
+ uv run deepresearch-flow paper db merge library \
+   --inputs ./paper_infos_a.json \
+   --inputs ./paper_infos_b.json \
+   --output ./paper_infos_merged.json
+
+ # 4) Merge BibTeX files
+ uv run deepresearch-flow paper db merge bibtex \
+   -i ./library_a.bib \
+   -i ./library_b.bib \
+   -o ./library_merged.bib
+ ```
+
+ ### Merge BibTeX files
+
+ ```bash
+ uv run deepresearch-flow paper db merge bibtex \
+   -i ./library_a.bib \
+   -i ./library_b.bib \
+   -o ./library_merged.bib
+ ```
+
+ On duplicate keys, the entry with the most fields is kept; ties are broken by input order (the earlier input wins).
+
  ### Recommended: Merge templates then filter by BibTeX

  ```bash
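
The duplicate-key rule in the README addition above is easy to state precisely. A minimal sketch of that resolution policy, using plain dicts as stand-in BibTeX entries (illustrative only, not the library's actual merge code):

```python
# Stated policy: on a key collision, keep the entry with more fields;
# on a tie, keep the entry from the earlier input.
def merge_bibtex_entries(
    inputs: list[dict[str, dict[str, str]]],
) -> dict[str, dict[str, str]]:
    merged: dict[str, dict[str, str]] = {}
    for entries in inputs:  # inputs in the order given on the CLI
        for key, fields in entries.items():
            current = merged.get(key)
            # Strict ">" means an equally sized later entry never replaces
            # an earlier one, which encodes the tie-break rule.
            if current is None or len(fields) > len(current):
                merged[key] = fields
    return merged

a = {"smith2020": {"title": "A", "year": "2020"}}
b = {"smith2020": {"title": "A", "year": "2020", "doi": "10.1/x"}}
assert merge_bibtex_entries([a, b])["smith2020"]["doi"] == "10.1/x"
```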
deepresearch_flow-0.7.0.dist-info/RECORD
@@ -4,7 +4,7 @@ deepresearch_flow/cli.py,sha256=t4oowCNWldL0DrVJ4d0UlRkuGU2qHej_G0mAc_quteQ,455
  deepresearch_flow/paper/__init__.py,sha256=sunaOkcgAJBrfmcaJTumcWbPGVUSGWvOv2a2Yidzy0A,43
  deepresearch_flow/paper/cli.py,sha256=68d-yccScU0yL6d7eqZVdudPO6i_in8F4v-hKDWILMo,13647
  deepresearch_flow/paper/config.py,sha256=V7z4ApPXCV1acSl2FU3nZGq6nt8uisMhm0GtOq5zzmg,12021
- deepresearch_flow/paper/db.py,sha256=UL2q4CFI33a3DZsZ42VOS_3FtTORnQuAogUfzPVjcO0,86579
+ deepresearch_flow/paper/db.py,sha256=RvUN9jeoaEgLNvf8NhWYD-cgIIMZwdZRK3cq17pNWZI,94727
  deepresearch_flow/paper/db_ops.py,sha256=cb64jn2ax39i3tCS-0DYmlsJdGX3uBS2u5ncUIbUBic,73980
  deepresearch_flow/paper/extract.py,sha256=78ASAyNLfCl1AsAk2o_v1vskZCNZuayaCHgr0S4V2Vs,87632
  deepresearch_flow/paper/llm.py,sha256=mHfs5IkT3Q6BOh46MDlfUmgVTX24WRf0IKKoOnN8nV8,4007
@@ -43,11 +43,14 @@ deepresearch_flow/paper/schemas/default_paper_schema.json,sha256=6h_2ayHolJj8JMn
  deepresearch_flow/paper/schemas/eight_questions_schema.json,sha256=VFKKpdZkgPdQkYIW5jyrZQ7c2TlQZwB4svVWfoiwxdg,1005
  deepresearch_flow/paper/schemas/three_pass_schema.json,sha256=8aNr4EdRiilxszIRBCC4hRNXrfIOcdnVW4Qhe6Fnh0o,689
  deepresearch_flow/paper/snapshot/__init__.py,sha256=1VLO36xxDB3J5Yoo-HH9vyI-4ev2HcivXN0sNLg8O5k,102
- deepresearch_flow/paper/snapshot/api.py,sha256=WgkOgS7n_2Fx-Bl4KnLrh5nhRJAsWJaPjXu7vX5ubxY,36960
+ deepresearch_flow/paper/snapshot/api.py,sha256=F_qehvCjxTBTGj9FmqP4NnJQayUPJm0N5e_8mm5JlDQ,37405
  deepresearch_flow/paper/snapshot/builder.py,sha256=HbRcfNteMoP4RnQ4y2onZCm9XfnIvzXLn_EwsLZsDzY,38692
+ deepresearch_flow/paper/snapshot/common.py,sha256=KAhlGlPgabOCe9Faps8BoDqin71qpkCfaL_ADCr_9vg,917
  deepresearch_flow/paper/snapshot/identity.py,sha256=k9x1EZPFBU1qgxzkTGvwVtDjLgcosmM_udPuvRLl0uI,7748
+ deepresearch_flow/paper/snapshot/mcp_server.py,sha256=lvgbXmuZCZ_zaQMdZEMjN-OChHPdoZ9MmuuQ-7ORias,22901
  deepresearch_flow/paper/snapshot/schema.py,sha256=DcVmAklLYyEeDoVV9jYw7hoMHnHd9Eziivl-LP2busY,8991
  deepresearch_flow/paper/snapshot/text.py,sha256=0RnxLowa6AdirdLsUYym6BhWbjwiP2Qj2oZeA-pjmdE,4368
+ deepresearch_flow/paper/snapshot/unpacker.py,sha256=ScKSFdrQLJHrITHe9KAxgAEH-vAAnXLolvW9zeJ3wsc,8575
  deepresearch_flow/paper/snapshot/tests/__init__.py,sha256=G0IowrxHjGUIaqxcw6SvlcLFAtE5ZsleG6ECgd-sIdk,52
  deepresearch_flow/paper/snapshot/tests/test_identity.py,sha256=KDFixAUU9l68KOum7gf1IrD0Oy18dBCSXG7RbJTqflA,4520
  deepresearch_flow/paper/templates/__init__.py,sha256=p8W6kINvrf-T2X6Ow4GMr28syVOorFuMn0pbmieVzAw,35
@@ -463,9 +466,9 @@ deepresearch_flow/translator/placeholder.py,sha256=mEgqA-dPdOsIhno0h_hzfpXpY2asb
  deepresearch_flow/translator/prompts.py,sha256=EvfBvBIpQXARDj4m87GAyFXJGL8EJeahj_rOmp9mv68,5556
  deepresearch_flow/translator/protector.py,sha256=yUMuS2FgVofK_MRXrcauLRiwNvdCCjNAnh6CcNd686o,11777
  deepresearch_flow/translator/segment.py,sha256=rBFMCLTrvm2GrPc_hNFymi-8Ih2DAtUQlZHCRE9nLaM,5146
- deepresearch_flow-0.6.0.dist-info/licenses/LICENSE,sha256=hT8F2Py1pe6flxq3Ufdm2UKFk0B8CBm0aAQfsLXfvjw,1063
- deepresearch_flow-0.6.0.dist-info/METADATA,sha256=fyynvn8LYDTZlsIaKDr3SxQbR8nqQSOk3s85ZIh1t6E,25838
- deepresearch_flow-0.6.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
- deepresearch_flow-0.6.0.dist-info/entry_points.txt,sha256=1uIKscs0YRMg_mFsg9NjsaTt4CvQqQ_-zGERUKhhL_Y,65
- deepresearch_flow-0.6.0.dist-info/top_level.txt,sha256=qBl4RvPJNJUbL8CFfMNWxY0HpQLx5RlF_ko-z_aKpm0,18
- deepresearch_flow-0.6.0.dist-info/RECORD,,
+ deepresearch_flow-0.7.0.dist-info/licenses/LICENSE,sha256=hT8F2Py1pe6flxq3Ufdm2UKFk0B8CBm0aAQfsLXfvjw,1063
+ deepresearch_flow-0.7.0.dist-info/METADATA,sha256=aluWW1CXPeSWCLKopChdbgl_GHEQHByua1fBohr6Mzg,26728
+ deepresearch_flow-0.7.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+ deepresearch_flow-0.7.0.dist-info/entry_points.txt,sha256=1uIKscs0YRMg_mFsg9NjsaTt4CvQqQ_-zGERUKhhL_Y,65
+ deepresearch_flow-0.7.0.dist-info/top_level.txt,sha256=qBl4RvPJNJUbL8CFfMNWxY0HpQLx5RlF_ko-z_aKpm0,18
+ deepresearch_flow-0.7.0.dist-info/RECORD,,