flowapy 0.1.3__tar.gz → 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {flowapy-0.1.3 → flowapy-0.2.0}/PKG-INFO +2 -1
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/README.md +11 -9
- flowapy-0.2.0/examples/demo/fixtures/papers/10.1002%2Fhumu.23878/pdf_index.pkl.zst +0 -0
- flowapy-0.2.0/examples/demo/fixtures/papers/10.1016%2Fj.ymgmr.2024.101163/pdf_index.pkl.zst +0 -0
- flowapy-0.2.0/examples/demo/fixtures/papers/10.1038%2Fs41598-022-25914-8/pdf_index.pkl.zst +0 -0
- flowapy-0.2.0/examples/demo/fixtures/papers/10.1186%2Fs12881-019-0878-8/pdf_index.pkl.zst +0 -0
- flowapy-0.2.0/examples/demo/fixtures/papers/10.1186%2Fs13023-021-01817-1/pdf_index.pkl.zst +0 -0
- flowapy-0.2.0/examples/demo/fixtures/papers/10.1186%2Fs13023-021-02146-z/pdf_index.pkl.zst +0 -0
- flowapy-0.2.0/examples/demo/fixtures/papers/10.1186%2Fs13023-023-02848-6/pdf_index.pkl.zst +0 -0
- flowapy-0.2.0/examples/demo/fixtures/papers/10.1186%2Fs13052-019-0692-0/pdf_index.pkl.zst +0 -0
- flowapy-0.2.0/examples/demo/fixtures/papers/10.3389%2Ffcvm.2022.1061384/pdf_index.pkl.zst +0 -0
- flowapy-0.2.0/examples/demo/fixtures/papers/10.3389%2Ffcvm.2023.1261172/pdf_index.pkl.zst +0 -0
- flowapy-0.2.0/examples/demo/fixtures/papers/10.3389%2Ffimmu.2024.1336599/pdf_index.pkl.zst +0 -0
- flowapy-0.2.0/examples/demo/fixtures/papers/10.3389%2Ffped.2021.729824/pdf_index.pkl.zst +0 -0
- flowapy-0.2.0/examples/demo/fixtures/papers/10.3389%2Ffphar.2022.903488/pdf_index.pkl.zst +0 -0
- flowapy-0.2.0/examples/demo/fixtures/papers/10.3390%2Fijns11010016/pdf_index.pkl.zst +0 -0
- flowapy-0.2.0/examples/demo/fixtures/papers/10.3390%2Fijns6020031/pdf_index.pkl.zst +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo-gateway/src/demo_gateway/main.py +13 -18
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo-gateway/tests/test_resolve.py +13 -28
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo-gateway/uv.lock +28 -1
- {flowapy-0.1.3 → flowapy-0.2.0}/packages/chat-service/package.json +2 -2
- {flowapy-0.1.3 → flowapy-0.2.0}/pyproject.toml +2 -1
- {flowapy-0.1.3 → flowapy-0.2.0}/src/flowa/aggregate.py +11 -21
- {flowapy-0.1.3 → flowapy-0.2.0}/src/flowa/convert.py +45 -20
- flowapy-0.2.0/src/flowa/pdf_index_cache.py +133 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/src/flowa/resolve.py +54 -69
- flowapy-0.2.0/tests/test_pdf_index_cache.py +120 -0
- flowapy-0.2.0/tests/test_resolve.py +158 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/uv.lock +28 -1
- flowapy-0.1.3/tests/test_resolve.py +0 -207
- {flowapy-0.1.3 → flowapy-0.2.0}/.env.example +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/.github/dependabot.yml +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/.github/workflows/dependabot-auto-merge.yml +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/.github/workflows/lint.yaml +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/.github/workflows/release-chat-service.yaml +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/.github/workflows/release-flowapy.yaml +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/.github/workflows/release-react-viewer.yaml +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/.gitignore +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/.markdownlint.json +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/.nvmrc +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/.pre-commit-config.yaml +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/.prettierignore +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/Dockerfile +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/LICENSE +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/README.md +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/docs/images/viewer.png +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/.gitkeep +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/.env.example +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/.gitignore +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/LICENSES.md +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/assessments/NM_000152_5-c_1935C_A/aggregation.json +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/assessments/NM_000152_5-c_1935C_A/extractions/10.1002%2Fhumu.23878.json +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/assessments/NM_000152_5-c_1935C_A/extractions/10.1016%2Fj.ymgmr.2024.101163.json +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/assessments/NM_000152_5-c_1935C_A/extractions/10.1038%2Fs41598-022-25914-8.json +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/assessments/NM_000152_5-c_1935C_A/extractions/10.1186%2Fs12881-019-0878-8.json +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/assessments/NM_000152_5-c_1935C_A/extractions/10.1186%2Fs13023-021-01817-1.json +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/assessments/NM_000152_5-c_1935C_A/extractions/10.1186%2Fs13023-021-02146-z.json +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/assessments/NM_000152_5-c_1935C_A/extractions/10.1186%2Fs13023-023-02848-6.json +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/assessments/NM_000152_5-c_1935C_A/extractions/10.1186%2Fs13052-019-0692-0.json +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/assessments/NM_000152_5-c_1935C_A/extractions/10.3389%2Ffcvm.2022.1061384.json +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/assessments/NM_000152_5-c_1935C_A/extractions/10.3389%2Ffcvm.2023.1261172.json +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/assessments/NM_000152_5-c_1935C_A/extractions/10.3389%2Ffimmu.2024.1336599.json +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/assessments/NM_000152_5-c_1935C_A/extractions/10.3389%2Ffped.2021.729824.json +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/assessments/NM_000152_5-c_1935C_A/extractions/10.3389%2Ffphar.2022.903488.json +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/assessments/NM_000152_5-c_1935C_A/extractions/10.3390%2Fijns11010016.json +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/assessments/NM_000152_5-c_1935C_A/extractions/10.3390%2Fijns6020031.json +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/assessments/NM_000152_5-c_1935C_A/query.json +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/assessments/NM_000152_5-c_1935C_A/runs/cfc0186a7b7e46eb802a516b86ec207f/progress.jsonl +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/assessments/NM_000152_5-c_1935C_A/variant_details.json +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/papers/10.1002%2Fajmg.a.61481/metadata.json +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/papers/10.1002%2Fhumu.23878/markdown.md +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/papers/10.1002%2Fhumu.23878/metadata.json +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/papers/10.1002%2Fhumu.23878/source.pdf +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/papers/10.1016%2Fj.ejmg.2020.103997/metadata.json +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/papers/10.1016%2Fj.nmd.2022.02.002/metadata.json +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/papers/10.1016%2Fj.tjog.2022.07.008/metadata.json +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/papers/10.1016%2Fj.ymgmr.2024.101163/markdown.md +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/papers/10.1016%2Fj.ymgmr.2024.101163/metadata.json +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/papers/10.1016%2Fj.ymgmr.2024.101163/source.pdf +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/papers/10.1038%2Fs41598-022-25914-8/markdown.md +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/papers/10.1038%2Fs41598-022-25914-8/metadata.json +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/papers/10.1038%2Fs41598-022-25914-8/source.pdf +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/papers/10.1093%2Fhmg%2Fddz218/metadata.json +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/papers/10.1136%2Fjmg-2022-108675/metadata.json +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/papers/10.1186%2Fs12881-019-0878-8/markdown.md +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/papers/10.1186%2Fs12881-019-0878-8/metadata.json +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/papers/10.1186%2Fs12881-019-0878-8/source.pdf +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/papers/10.1186%2Fs13023-021-01817-1/markdown.md +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/papers/10.1186%2Fs13023-021-01817-1/metadata.json +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/papers/10.1186%2Fs13023-021-01817-1/source.pdf +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/papers/10.1186%2Fs13023-021-02146-z/markdown.md +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/papers/10.1186%2Fs13023-021-02146-z/metadata.json +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/papers/10.1186%2Fs13023-021-02146-z/source.pdf +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/papers/10.1186%2Fs13023-023-02848-6/markdown.md +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/papers/10.1186%2Fs13023-023-02848-6/metadata.json +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/papers/10.1186%2Fs13023-023-02848-6/source.pdf +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/papers/10.1186%2Fs13052-019-0692-0/markdown.md +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/papers/10.1186%2Fs13052-019-0692-0/metadata.json +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/papers/10.1186%2Fs13052-019-0692-0/source.pdf +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/papers/10.3389%2Ffcvm.2022.1061384/markdown.md +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/papers/10.3389%2Ffcvm.2022.1061384/metadata.json +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/papers/10.3389%2Ffcvm.2022.1061384/source.pdf +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/papers/10.3389%2Ffcvm.2023.1261172/markdown.md +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/papers/10.3389%2Ffcvm.2023.1261172/metadata.json +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/papers/10.3389%2Ffcvm.2023.1261172/source.pdf +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/papers/10.3389%2Ffimmu.2024.1336599/markdown.md +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/papers/10.3389%2Ffimmu.2024.1336599/metadata.json +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/papers/10.3389%2Ffimmu.2024.1336599/source.pdf +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/papers/10.3389%2Ffped.2021.729824/markdown.md +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/papers/10.3389%2Ffped.2021.729824/metadata.json +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/papers/10.3389%2Ffped.2021.729824/source.pdf +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/papers/10.3389%2Ffphar.2022.903488/markdown.md +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/papers/10.3389%2Ffphar.2022.903488/metadata.json +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/papers/10.3389%2Ffphar.2022.903488/source.pdf +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/papers/10.3390%2Fijns11010016/markdown.md +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/papers/10.3390%2Fijns11010016/metadata.json +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/papers/10.3390%2Fijns11010016/source.pdf +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/papers/10.3390%2Fijns6020031/markdown.md +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/papers/10.3390%2Fijns6020031/metadata.json +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/papers/10.3390%2Fijns6020031/source.pdf +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/next-env.d.ts +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/next.config.mjs +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/package.json +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/postcss.config.cjs +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/public/favicon.svg +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/scripts/chat-service.ts +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/scripts/copy-pdfjs-assets.ts +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/scripts/exercise-llm.ts +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/scripts/start.ts +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/src/components/literature/LiteratureView.tsx +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/src/components/literature/PaperStatusGroup.tsx +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/src/components/literature/ProgressLog.tsx +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/src/components/literature/matchFilename.ts +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/src/db/migrate.ts +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/src/db/schema.sql +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/src/lib/aggregate.ts +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/src/lib/chatSessionClient.ts +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/src/lib/citationResolverClient.ts +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/src/lib/demoConfig.ts +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/src/lib/papers.ts +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/src/lib/progressEvents.ts +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/src/lib/runs.ts +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/src/lib/triageBackendClient.ts +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/src/lib/triageDb.ts +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/src/lib/variantId.ts +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/src/pages/_app.tsx +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/src/pages/api/aggregate/[variantId]/[category].ts +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/src/pages/api/edit-drafts/[variantId]/[category]/[version].ts +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/src/pages/api/edit-drafts/[variantId]/[category]/index.ts +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/src/pages/api/papers/[doi]/pdf.ts +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/src/pages/api/papers/index.ts +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/src/pages/api/runs/[variantId]/[runId]/progress.ts +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/src/pages/api/runs/index.ts +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/src/pages/api/runs/latest.ts +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/src/pages/api/triage/claim.ts +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/src/pages/api/triage/comment.ts +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/src/pages/api/triage/paper-done.ts +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/src/pages/api/triage/snapshot/[variantId]/[category]/[version].ts +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/src/pages/index.tsx +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/src/pages/variants/[variantId].tsx +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/src/pages/viewer/[variantId]/[category].tsx +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/src/styles/globals.css +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/tailwind.config.ts +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/test/LiteratureView.test.tsx +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/test/ProgressLog.test.tsx +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/test/aggregate.test.ts +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/test/chat-service.test.ts +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/test/index-page.test.tsx +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/test/matchFilename.test.ts +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/test/papers-pdf-upload.test.ts +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/test/papers-route.test.ts +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/test/papers.test.ts +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/test/progress-route.test.ts +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/test/runs-latest-route.test.ts +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/test/runs-route.test.ts +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/test/runs.test.ts +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/test/setup.ts +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/test/triage.test.ts +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/test/variantId.test.ts +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/tsconfig.json +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/vitest.config.ts +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo-gateway/README.md +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo-gateway/pyproject.toml +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo-gateway/src/demo_gateway/__init__.py +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo-gateway/src/demo_gateway/config.py +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo-gateway/src/demo_gateway/progress.py +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo-gateway/src/demo_gateway/runs.py +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo-gateway/tests/__init__.py +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo-gateway/tests/conftest.py +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo-gateway/tests/test_main.py +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo-gateway/tests/test_progress.py +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo-gateway/tests/test_runs.py +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/package.json +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/packages/.gitkeep +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/packages/chat-service/Dockerfile +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/packages/chat-service/LICENSE +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/packages/chat-service/README.md +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/packages/chat-service/src/artifact.ts +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/packages/chat-service/src/audit.ts +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/packages/chat-service/src/auth/jwt.ts +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/packages/chat-service/src/auth/oidc.ts +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/packages/chat-service/src/chat.ts +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/packages/chat-service/src/cli.ts +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/packages/chat-service/src/config.ts +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/packages/chat-service/src/index.ts +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/packages/chat-service/src/instrumentation.ts +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/packages/chat-service/src/llm/anthropic.ts +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/packages/chat-service/src/llm/bedrock.ts +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/packages/chat-service/src/llm/factory.ts +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/packages/chat-service/src/llm/google-gla.ts +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/packages/chat-service/src/llm/google-vertex.ts +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/packages/chat-service/src/llm/interface.ts +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/packages/chat-service/src/llm/openai.ts +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/packages/chat-service/src/prompts.ts +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/packages/chat-service/src/server.ts +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/packages/chat-service/src/session.ts +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/packages/chat-service/src/storage/factory.ts +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/packages/chat-service/src/storage/fs.ts +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/packages/chat-service/src/storage/gcs.ts +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/packages/chat-service/src/storage/interface.ts +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/packages/chat-service/src/storage/s3.ts +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/packages/chat-service/src/storage-keys.ts +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/packages/chat-service/src/telemetry.ts +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/packages/chat-service/src/text.ts +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/packages/chat-service/src/yaml.ts +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/packages/chat-service/test/chat.test.ts +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/packages/chat-service/test/generic-prompt.test.ts +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/packages/chat-service/test/llm-factory.test.ts +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/packages/chat-service/test/oidc.test.ts +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/packages/chat-service/test/paper-cache.test.ts +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/packages/chat-service/test/storage-fs.test.ts +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/packages/chat-service/test/storage-gcs.test.ts +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/packages/chat-service/test/storage-s3.test.ts +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/packages/chat-service/test/text.test.ts +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/packages/chat-service/test/yaml.test.ts +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/packages/chat-service/tsconfig.build.json +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/packages/chat-service/tsconfig.json +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/packages/chat-service/vitest.config.ts +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/packages/react-viewer/LICENSE +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/packages/react-viewer/README.md +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/packages/react-viewer/package.json +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/packages/react-viewer/src/citations/sanitize.test.ts +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/packages/react-viewer/src/citations/sanitize.ts +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/packages/react-viewer/src/citations/types.ts +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/packages/react-viewer/src/index.ts +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/packages/react-viewer/src/llm-content/LlmContent.test.tsx +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/packages/react-viewer/src/llm-content/LlmContent.tsx +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/packages/react-viewer/src/pdf-viewer/PdfHighlightViewer.test.tsx +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/packages/react-viewer/src/pdf-viewer/PdfHighlightViewer.tsx +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/packages/react-viewer/src/pdf-viewer/types.ts +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/packages/react-viewer/src/styles.css +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/packages/react-viewer/src/triage/ChatDrawer.tsx +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/packages/react-viewer/src/triage/ChatSection.tsx +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/packages/react-viewer/src/triage/ClaimList.tsx +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/packages/react-viewer/src/triage/EvidenceViewerShell.test.tsx +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/packages/react-viewer/src/triage/EvidenceViewerShell.tsx +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/packages/react-viewer/src/triage/FocusCard.tsx +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/packages/react-viewer/src/triage/PaperHeader.tsx +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/packages/react-viewer/src/triage/PaperRail.test.tsx +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/packages/react-viewer/src/triage/PaperRail.tsx +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/packages/react-viewer/src/triage/SynthesisPanel.tsx +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/packages/react-viewer/src/triage/backend.ts +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/packages/react-viewer/src/triage/citation-resolver.ts +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/packages/react-viewer/src/triage/citation-utils.test.ts +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/packages/react-viewer/src/triage/citation-utils.ts +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/packages/react-viewer/src/triage/claim-refs.test.ts +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/packages/react-viewer/src/triage/claim-refs.ts +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/packages/react-viewer/src/triage/keyboard.test.ts +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/packages/react-viewer/src/triage/keyboard.ts +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/packages/react-viewer/src/triage/store.test.ts +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/packages/react-viewer/src/triage/store.ts +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/packages/react-viewer/src/triage/types.ts +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/packages/react-viewer/tailwind.config.ts +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/packages/react-viewer/tsconfig.json +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/packages/react-viewer/tsup.config.ts +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/packages/react-viewer/vitest.config.ts +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/pnpm-lock.yaml +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/pnpm-workspace.yaml +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/prompts/generic/aggregation_edit_prompt.txt +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/prompts/generic/aggregation_edit_schema.ts +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/prompts/generic/aggregation_prompt.txt +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/prompts/generic/aggregation_schema.py +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/prompts/generic/extraction_prompt.txt +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/prompts/generic/extraction_schema.py +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/prompts/generic/transcription_prompt.txt +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/prompts/package.json +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/src/flowa/__init__.py +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/src/flowa/artifact.py +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/src/flowa/cli.py +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/src/flowa/clinvar.py +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/src/flowa/download.py +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/src/flowa/extract.py +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/src/flowa/http_retry.py +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/src/flowa/models.py +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/src/flowa/normalize.py +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/src/flowa/progress.py +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/src/flowa/prompts/__init__.py +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/src/flowa/py.typed +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/src/flowa/query.py +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/src/flowa/run.py +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/src/flowa/schema.py +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/src/flowa/settings.py +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/src/flowa/storage.py +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/tests/__init__.py +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/tests/test_progress.py +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/tests/test_prompts.py +0 -0
- {flowapy-0.1.3 → flowapy-0.2.0}/tsconfig.base.json +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: flowapy
|
|
3
|
-
Version: 0.1.3
|
|
3
|
+
Version: 0.2.0
|
|
4
4
|
Summary: Variant literature assessment pipeline with AI extraction
|
|
5
5
|
Project-URL: Homepage, https://github.com/populationgenomics/flowa
|
|
6
6
|
Project-URL: Source, https://github.com/populationgenomics/flowa
|
|
@@ -47,6 +47,7 @@ Requires-Dist: pypdf
|
|
|
47
47
|
Requires-Dist: s3fs
|
|
48
48
|
Requires-Dist: tenacity
|
|
49
49
|
Requires-Dist: typer
|
|
50
|
+
Requires-Dist: zstandard
|
|
50
51
|
Provides-Extra: anthropic
|
|
51
52
|
Requires-Dist: pydantic-ai-slim[anthropic]==1.101.0; extra == 'anthropic'
|
|
52
53
|
Provides-Extra: bedrock
|
|
@@ -166,10 +166,10 @@ VARIANT=NM_001035_3-c_14174A_G
|
|
|
166
166
|
rm -f assessments/$VARIANT/aggregation.json \
|
|
167
167
|
assessments/$VARIANT/aggregation_raw.json
|
|
168
168
|
rm -rf assessments/$VARIANT/extractions/ assessments/$VARIANT/runs/
|
|
169
|
-
# Re-runs flowa.convert (which uses anchorite for PDF chunking
|
|
170
|
-
# Drop this line to reuse the cached markdown
|
|
171
|
-
# aggregate.
|
|
172
|
-
rm -f papers/*/markdown.md papers/*/convert_raw.json
|
|
169
|
+
# Re-runs flowa.convert (which uses anchorite for PDF chunking and
|
|
170
|
+
# builds pdf_index.pkl.zst). Drop this line to reuse the cached markdown
|
|
171
|
+
# + index and only redo extract + aggregate.
|
|
172
|
+
rm -f papers/*/markdown.md papers/*/convert_raw.json papers/*/pdf_index.pkl.zst
|
|
173
173
|
```
|
|
174
174
|
|
|
175
175
|
Then drive the pipeline. The demo's `scripts/start.ts` translates the
|
|
@@ -208,11 +208,13 @@ and not needed by anything downstream.
|
|
|
208
208
|
For papers whose source license blocks redistribution (CC-BY-NC-ND,
|
|
209
209
|
paywalled; see `fixtures/LICENSES.md` for the rule), do **not** delete
|
|
210
210
|
the whole `papers/{encodedDoi}/` directory — only delete `source.pdf`,
|
|
211
|
-
`markdown.md`,
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
211
|
+
`markdown.md`, `convert_raw.json`, and `pdf_index.pkl.zst`. The
|
|
212
|
+
`pdf_index.pkl.zst` embeds the PDF's extracted text (anchorite's char
|
|
213
|
+
index), so it carries the same copyright as `source.pdf` and must not
|
|
214
|
+
ship in the open-source repo. Keep `metadata.json` (the bibliographic
|
|
215
|
+
fields are factual data, not copyrightable) but replace its `abstract`
|
|
216
|
+
field with a sentinel string, so the omission reads as deliberate (not
|
|
217
|
+
a missing-data bug) when the literature view renders the row:
|
|
216
218
|
|
|
217
219
|
```bash
|
|
218
220
|
python3 -c "
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
@@ -18,9 +18,13 @@ from typing import Annotated
|
|
|
18
18
|
import uvicorn
|
|
19
19
|
from fastapi import APIRouter, Depends, FastAPI, HTTPException, Query, Request, status
|
|
20
20
|
from fastapi.middleware.cors import CORSMiddleware
|
|
21
|
-
from flowa.resolve import
|
|
21
|
+
from flowa.resolve import (
|
|
22
|
+
ResolvedCitations,
|
|
23
|
+
ResolveRequest,
|
|
24
|
+
load_pdf_index_from_storage,
|
|
25
|
+
resolve_citations,
|
|
26
|
+
)
|
|
22
27
|
from flowa.schema import VariantSpec
|
|
23
|
-
from flowa.storage import paper_url, read_bytes, read_text
|
|
24
28
|
from pydantic import BaseModel, Field
|
|
25
29
|
|
|
26
30
|
from .config import Settings
|
|
@@ -111,24 +115,15 @@ def resolve_citations_route(
|
|
|
111
115
|
) -> ResolvedCitations:
|
|
112
116
|
"""Align verbatim quotes to PDF bboxes.
|
|
113
117
|
|
|
114
|
-
Sync `def` so FastAPI auto-runs it in the threadpool —
|
|
115
|
-
|
|
118
|
+
Sync `def` so FastAPI auto-runs it in the threadpool — deserialising the
|
|
119
|
+
PdfIndex pickle and aligning quotes is CPU-bound and would block the
|
|
120
|
+
asyncio loop otherwise.
|
|
116
121
|
"""
|
|
117
122
|
base = str(settings.demo_data_dir)
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
except FileNotFoundError:
|
|
123
|
-
return None
|
|
124
|
-
|
|
125
|
-
def md_loader(doi: str) -> str | None:
|
|
126
|
-
try:
|
|
127
|
-
return read_text(paper_url(base, doi, 'markdown.md'))
|
|
128
|
-
except FileNotFoundError:
|
|
129
|
-
return None
|
|
130
|
-
|
|
131
|
-
return resolve_citations(body.citations, pdf_loader=pdf_loader, markdown_loader=md_loader)
|
|
123
|
+
return resolve_citations(
|
|
124
|
+
body.citations,
|
|
125
|
+
index_provider=lambda doi: load_pdf_index_from_storage(base, doi),
|
|
126
|
+
)
|
|
132
127
|
|
|
133
128
|
|
|
134
129
|
@asynccontextmanager
|
|
@@ -1,24 +1,13 @@
|
|
|
1
1
|
"""HTTP shape tests for /resolve-citations.
|
|
2
2
|
|
|
3
3
|
Library-level resolver behaviour is covered in flowa's `tests/test_resolve.py`;
|
|
4
|
-
here we just check that the route plumbs Settings →
|
|
5
|
-
correctly and returns the expected wire shape.
|
|
4
|
+
here we just check that the route plumbs Settings → index_provider →
|
|
5
|
+
flowa.resolve correctly and returns the expected wire shape.
|
|
6
6
|
"""
|
|
7
7
|
|
|
8
|
-
from pathlib import Path
|
|
9
|
-
|
|
10
8
|
from fastapi.testclient import TestClient
|
|
11
|
-
from flowa import resolve as flowa_resolve_module
|
|
12
|
-
from flowa.storage import encode_doi
|
|
13
|
-
|
|
14
|
-
from demo_gateway.config import Settings
|
|
15
|
-
|
|
16
9
|
|
|
17
|
-
|
|
18
|
-
paper_dir = data_dir / 'papers' / encode_doi(doi)
|
|
19
|
-
paper_dir.mkdir(parents=True, exist_ok=True)
|
|
20
|
-
(paper_dir / 'source.pdf').write_bytes(pdf)
|
|
21
|
-
(paper_dir / 'markdown.md').write_text(markdown)
|
|
10
|
+
import demo_gateway.main as demo_main
|
|
22
11
|
|
|
23
12
|
|
|
24
13
|
def test_post_resolve_citations_rejects_malformed_body(client: TestClient) -> None:
|
|
@@ -26,8 +15,8 @@ def test_post_resolve_citations_rejects_malformed_body(client: TestClient) -> No
|
|
|
26
15
|
assert response.status_code == 422
|
|
27
16
|
|
|
28
17
|
|
|
29
|
-
def
|
|
30
|
-
"""When
|
|
18
|
+
def test_post_resolve_citations_returns_errors_for_missing_index(client: TestClient) -> None:
|
|
19
|
+
"""When pdf_index.pkl.zst is absent, the DOI surfaces in `errors` rather than `resolved`."""
|
|
31
20
|
response = client.post(
|
|
32
21
|
'/resolve-citations',
|
|
33
22
|
json={'citations': [{'doi': '10.1/missing', 'quotes': ['anything']}]},
|
|
@@ -35,15 +24,11 @@ def test_post_resolve_citations_returns_errors_for_missing_pdfs(client: TestClie
|
|
|
35
24
|
assert response.status_code == 200
|
|
36
25
|
body = response.json()
|
|
37
26
|
assert body['resolved'] == {}
|
|
38
|
-
assert body['errors'] == {'10.1/missing': '
|
|
27
|
+
assert body['errors'] == {'10.1/missing': 'pdf_index not available'}
|
|
39
28
|
|
|
40
29
|
|
|
41
|
-
def test_post_resolve_citations_returns_resolved_bboxes(
|
|
42
|
-
|
|
43
|
-
settings: Settings,
|
|
44
|
-
monkeypatch,
|
|
45
|
-
) -> None:
|
|
46
|
-
"""When source.pdf exists, the route resolves quotes to bboxes via the library."""
|
|
30
|
+
def test_post_resolve_citations_returns_resolved_bboxes(client: TestClient, monkeypatch) -> None:
|
|
31
|
+
"""When the index loads, the route resolves quotes to bboxes via the library."""
|
|
47
32
|
|
|
48
33
|
class _FakeBbox:
|
|
49
34
|
def __init__(self, top: int, left: int, bottom: int, right: int) -> None:
|
|
@@ -53,16 +38,16 @@ def test_post_resolve_citations_returns_resolved_bboxes(
|
|
|
53
38
|
self.right = right
|
|
54
39
|
|
|
55
40
|
class _FakePdfIndex:
|
|
56
|
-
def __init__(self, _pdf_bytes: bytes, *, markdown: str | None = None) -> None:
|
|
57
|
-
pass
|
|
58
|
-
|
|
59
41
|
def resolve(self, quotes: list[str]) -> dict[str, list[tuple[int, _FakeBbox]]]:
|
|
60
42
|
# 0-indexed page from anchorite — the +1 boundary wrap in resolve.py
|
|
61
43
|
# turns this into page=1 on the wire.
|
|
62
44
|
return {q: [(0, _FakeBbox(top=10, left=20, bottom=30, right=40))] for q in quotes}
|
|
63
45
|
|
|
64
|
-
monkeypatch.setattr(
|
|
65
|
-
|
|
46
|
+
monkeypatch.setattr(
|
|
47
|
+
demo_main,
|
|
48
|
+
'load_pdf_index_from_storage',
|
|
49
|
+
lambda _base, doi: _FakePdfIndex() if doi == '10.1/present' else None,
|
|
50
|
+
)
|
|
66
51
|
|
|
67
52
|
response = client.post(
|
|
68
53
|
'/resolve-citations',
|
|
@@ -415,7 +415,7 @@ wheels = [
|
|
|
415
415
|
|
|
416
416
|
[[package]]
|
|
417
417
|
name = "flowapy"
|
|
418
|
-
version = "0.
|
|
418
|
+
version = "0.2.0"
|
|
419
419
|
source = { editable = "../../" }
|
|
420
420
|
dependencies = [
|
|
421
421
|
{ name = "anchorite" },
|
|
@@ -431,6 +431,7 @@ dependencies = [
|
|
|
431
431
|
{ name = "s3fs" },
|
|
432
432
|
{ name = "tenacity" },
|
|
433
433
|
{ name = "typer" },
|
|
434
|
+
{ name = "zstandard" },
|
|
434
435
|
]
|
|
435
436
|
|
|
436
437
|
[package.optional-dependencies]
|
|
@@ -466,6 +467,7 @@ requires-dist = [
|
|
|
466
467
|
{ name = "s3fs" },
|
|
467
468
|
{ name = "tenacity" },
|
|
468
469
|
{ name = "typer" },
|
|
470
|
+
{ name = "zstandard" },
|
|
469
471
|
]
|
|
470
472
|
provides-extras = ["anthropic", "bedrock", "google", "openai"]
|
|
471
473
|
|
|
@@ -2109,3 +2111,28 @@ sdist = { url = "https://files.pythonhosted.org/packages/30/21/093488dfc7cc8964d
|
|
|
2109
2111
|
wheels = [
|
|
2110
2112
|
{ url = "https://files.pythonhosted.org/packages/08/8a/0861bec20485572fbddf3dfba2910e38fe249796cb73ecdeb74e07eeb8d3/zipp-3.23.1-py3-none-any.whl", hash = "sha256:0b3596c50a5c700c9cb40ba8d86d9f2cc4807e9bedb06bcdf7fac85633e444dc", size = 10378, upload-time = "2026-04-13T23:21:45.386Z" },
|
|
2111
2113
|
]
|
|
2114
|
+
|
|
2115
|
+
[[package]]
|
|
2116
|
+
name = "zstandard"
|
|
2117
|
+
version = "0.25.0"
|
|
2118
|
+
source = { registry = "https://pypi.org/simple" }
|
|
2119
|
+
sdist = { url = "https://files.pythonhosted.org/packages/fd/aa/3e0508d5a5dd96529cdc5a97011299056e14c6505b678fd58938792794b1/zstandard-0.25.0.tar.gz", hash = "sha256:7713e1179d162cf5c7906da876ec2ccb9c3a9dcbdffef0cc7f70c3667a205f0b", size = 711513, upload-time = "2025-09-14T22:15:54.002Z" }
|
|
2120
|
+
wheels = [
|
|
2121
|
+
{ url = "https://files.pythonhosted.org/packages/35/0b/8df9c4ad06af91d39e94fa96cc010a24ac4ef1378d3efab9223cc8593d40/zstandard-0.25.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:ec996f12524f88e151c339688c3897194821d7f03081ab35d31d1e12ec975e94", size = 795735, upload-time = "2025-09-14T22:17:26.042Z" },
|
|
2122
|
+
{ url = "https://files.pythonhosted.org/packages/3f/06/9ae96a3e5dcfd119377ba33d4c42a7d89da1efabd5cb3e366b156c45ff4d/zstandard-0.25.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:a1a4ae2dec3993a32247995bdfe367fc3266da832d82f8438c8570f989753de1", size = 640440, upload-time = "2025-09-14T22:17:27.366Z" },
|
|
2123
|
+
{ url = "https://files.pythonhosted.org/packages/d9/14/933d27204c2bd404229c69f445862454dcc101cd69ef8c6068f15aaec12c/zstandard-0.25.0-cp313-cp313-manylinux2010_i686.manylinux2014_i686.manylinux_2_12_i686.manylinux_2_17_i686.whl", hash = "sha256:e96594a5537722fdfb79951672a2a63aec5ebfb823e7560586f7484819f2a08f", size = 5343070, upload-time = "2025-09-14T22:17:28.896Z" },
|
|
2124
|
+
{ url = "https://files.pythonhosted.org/packages/6d/db/ddb11011826ed7db9d0e485d13df79b58586bfdec56e5c84a928a9a78c1c/zstandard-0.25.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:bfc4e20784722098822e3eee42b8e576b379ed72cca4a7cb856ae733e62192ea", size = 5063001, upload-time = "2025-09-14T22:17:31.044Z" },
|
|
2125
|
+
{ url = "https://files.pythonhosted.org/packages/db/00/87466ea3f99599d02a5238498b87bf84a6348290c19571051839ca943777/zstandard-0.25.0-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:457ed498fc58cdc12fc48f7950e02740d4f7ae9493dd4ab2168a47c93c31298e", size = 5394120, upload-time = "2025-09-14T22:17:32.711Z" },
|
|
2126
|
+
{ url = "https://files.pythonhosted.org/packages/2b/95/fc5531d9c618a679a20ff6c29e2b3ef1d1f4ad66c5e161ae6ff847d102a9/zstandard-0.25.0-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:fd7a5004eb1980d3cefe26b2685bcb0b17989901a70a1040d1ac86f1d898c551", size = 5451230, upload-time = "2025-09-14T22:17:34.41Z" },
|
|
2127
|
+
{ url = "https://files.pythonhosted.org/packages/63/4b/e3678b4e776db00f9f7b2fe58e547e8928ef32727d7a1ff01dea010f3f13/zstandard-0.25.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:8e735494da3db08694d26480f1493ad2cf86e99bdd53e8e9771b2752a5c0246a", size = 5547173, upload-time = "2025-09-14T22:17:36.084Z" },
|
|
2128
|
+
{ url = "https://files.pythonhosted.org/packages/4e/d5/ba05ed95c6b8ec30bd468dfeab20589f2cf709b5c940483e31d991f2ca58/zstandard-0.25.0-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:3a39c94ad7866160a4a46d772e43311a743c316942037671beb264e395bdd611", size = 5046736, upload-time = "2025-09-14T22:17:37.891Z" },
|
|
2129
|
+
{ url = "https://files.pythonhosted.org/packages/50/d5/870aa06b3a76c73eced65c044b92286a3c4e00554005ff51962deef28e28/zstandard-0.25.0-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:172de1f06947577d3a3005416977cce6168f2261284c02080e7ad0185faeced3", size = 5576368, upload-time = "2025-09-14T22:17:40.206Z" },
|
|
2130
|
+
{ url = "https://files.pythonhosted.org/packages/5d/35/398dc2ffc89d304d59bc12f0fdd931b4ce455bddf7038a0a67733a25f550/zstandard-0.25.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:3c83b0188c852a47cd13ef3bf9209fb0a77fa5374958b8c53aaa699398c6bd7b", size = 4954022, upload-time = "2025-09-14T22:17:41.879Z" },
|
|
2131
|
+
{ url = "https://files.pythonhosted.org/packages/9a/5c/36ba1e5507d56d2213202ec2b05e8541734af5f2ce378c5d1ceaf4d88dc4/zstandard-0.25.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:1673b7199bbe763365b81a4f3252b8e80f44c9e323fc42940dc8843bfeaf9851", size = 5267889, upload-time = "2025-09-14T22:17:43.577Z" },
|
|
2132
|
+
{ url = "https://files.pythonhosted.org/packages/70/e8/2ec6b6fb7358b2ec0113ae202647ca7c0e9d15b61c005ae5225ad0995df5/zstandard-0.25.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:0be7622c37c183406f3dbf0cba104118eb16a4ea7359eeb5752f0794882fc250", size = 5433952, upload-time = "2025-09-14T22:17:45.271Z" },
|
|
2133
|
+
{ url = "https://files.pythonhosted.org/packages/7b/01/b5f4d4dbc59ef193e870495c6f1275f5b2928e01ff5a81fecb22a06e22fb/zstandard-0.25.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:5f5e4c2a23ca271c218ac025bd7d635597048b366d6f31f420aaeb715239fc98", size = 5814054, upload-time = "2025-09-14T22:17:47.08Z" },
|
|
2134
|
+
{ url = "https://files.pythonhosted.org/packages/b2/e5/fbd822d5c6f427cf158316d012c5a12f233473c2f9c5fe5ab1ae5d21f3d8/zstandard-0.25.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:4f187a0bb61b35119d1926aee039524d1f93aaf38a9916b8c4b78ac8514a0aaf", size = 5360113, upload-time = "2025-09-14T22:17:48.893Z" },
|
|
2135
|
+
{ url = "https://files.pythonhosted.org/packages/8e/e0/69a553d2047f9a2c7347caa225bb3a63b6d7704ad74610cb7823baa08ed7/zstandard-0.25.0-cp313-cp313-win32.whl", hash = "sha256:7030defa83eef3e51ff26f0b7bfb229f0204b66fe18e04359ce3474ac33cbc09", size = 436936, upload-time = "2025-09-14T22:17:52.658Z" },
|
|
2136
|
+
{ url = "https://files.pythonhosted.org/packages/d9/82/b9c06c870f3bd8767c201f1edbdf9e8dc34be5b0fbc5682c4f80fe948475/zstandard-0.25.0-cp313-cp313-win_amd64.whl", hash = "sha256:1f830a0dac88719af0ae43b8b2d6aef487d437036468ef3c2ea59c51f9d55fd5", size = 506232, upload-time = "2025-09-14T22:17:50.402Z" },
|
|
2137
|
+
{ url = "https://files.pythonhosted.org/packages/d4/57/60c3c01243bb81d381c9916e2a6d9e149ab8627c0c7d7abb2d73384b3c0c/zstandard-0.25.0-cp313-cp313-win_arm64.whl", hash = "sha256:85304a43f4d513f5464ceb938aa02c1e78c2943b29f44a750b48b25ac999a049", size = 462671, upload-time = "2025-09-14T22:17:51.533Z" },
|
|
2138
|
+
]
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@flowajs/chat-service",
|
|
3
|
-
"version": "0.0.
|
|
3
|
+
"version": "0.0.4",
|
|
4
4
|
"description": "Stateless service that orchestrates LLM conversations over flowa artifacts.",
|
|
5
5
|
"license": "MIT",
|
|
6
6
|
"type": "module",
|
|
@@ -71,7 +71,7 @@
|
|
|
71
71
|
"zod": "4.4.3"
|
|
72
72
|
},
|
|
73
73
|
"peerDependencies": {
|
|
74
|
-
"@ai-sdk/amazon-bedrock": "^4.0.
|
|
74
|
+
"@ai-sdk/amazon-bedrock": "^4.0.101",
|
|
75
75
|
"@ai-sdk/anthropic": "^3.0.0",
|
|
76
76
|
"@ai-sdk/google": "^3.0.0",
|
|
77
77
|
"@ai-sdk/google-vertex": "^4.0.0",
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "flowapy"
|
|
3
|
-
version = "0.
|
|
3
|
+
version = "0.2.0"
|
|
4
4
|
description = "Variant literature assessment pipeline with AI extraction"
|
|
5
5
|
readme = "README.md"
|
|
6
6
|
requires-python = "==3.13.*"
|
|
@@ -27,6 +27,7 @@ dependencies = [
|
|
|
27
27
|
"s3fs", # S3/MinIO support for fsspec
|
|
28
28
|
"tenacity",
|
|
29
29
|
"typer",
|
|
30
|
+
"zstandard",
|
|
30
31
|
]
|
|
31
32
|
|
|
32
33
|
[project.optional-dependencies]
|
|
@@ -15,7 +15,7 @@ from pydantic_ai import Agent, ModelRetry, NativeOutput, RunContext
|
|
|
15
15
|
from flowa.clinvar import format_clinvar_for_prompt, query_clinvar
|
|
16
16
|
from flowa.models import create_model, get_model_settings
|
|
17
17
|
from flowa.prompts import load_prompt_and_schema
|
|
18
|
-
from flowa.resolve import CitationQuery, resolve_citations
|
|
18
|
+
from flowa.resolve import CitationQuery, load_pdf_index_from_storage, resolve_citations
|
|
19
19
|
from flowa.schema import AGGREGATION_SCHEMA_VERSION, with_schema_version
|
|
20
20
|
from flowa.settings import ModelConfig, Settings
|
|
21
21
|
from flowa.storage import (
|
|
@@ -23,9 +23,7 @@ from flowa.storage import (
|
|
|
23
23
|
encode_doi,
|
|
24
24
|
exists,
|
|
25
25
|
paper_url,
|
|
26
|
-
read_bytes,
|
|
27
26
|
read_json,
|
|
28
|
-
read_text,
|
|
29
27
|
write_bytes,
|
|
30
28
|
write_json,
|
|
31
29
|
)
|
|
@@ -158,16 +156,14 @@ def create_aggregate_agent(
|
|
|
158
156
|
def resolve_aggregate_citations(
|
|
159
157
|
aggregate_dict: dict[str, Any],
|
|
160
158
|
paper_id_to_doi: dict[str, str],
|
|
161
|
-
|
|
162
|
-
markdown_cache: dict[str, str],
|
|
159
|
+
base: str,
|
|
163
160
|
metadata_cache: dict[str, dict[str, Any]],
|
|
164
161
|
) -> None:
|
|
165
162
|
"""Post-process aggregate output: resolve quotes to bboxes on claim citations.
|
|
166
163
|
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
quote) pair resolves to exactly one paper.
|
|
164
|
+
Loads each paper's pre-built `pdf_index.pkl.zst` via the same path the
|
|
165
|
+
gateway uses; the convert step that ran earlier in this pipeline wrote
|
|
166
|
+
the artifact, so it's guaranteed to be present.
|
|
171
167
|
"""
|
|
172
168
|
# Collect all (doi, quote) pairs, grouped by DOI.
|
|
173
169
|
doi_quotes: dict[str, list[str]] = {}
|
|
@@ -180,8 +176,7 @@ def resolve_aggregate_citations(
|
|
|
180
176
|
citations_input = [CitationQuery(doi=doi, quotes=quotes) for doi, quotes in doi_quotes.items()]
|
|
181
177
|
result = resolve_citations(
|
|
182
178
|
citations_input,
|
|
183
|
-
|
|
184
|
-
markdown_loader=markdown_cache.get,
|
|
179
|
+
index_provider=lambda doi: load_pdf_index_from_storage(base, doi),
|
|
185
180
|
)
|
|
186
181
|
|
|
187
182
|
# Attach resolved bboxes onto each claim's citations.
|
|
@@ -229,14 +224,11 @@ async def aggregate_evidence_async(
|
|
|
229
224
|
clinvar_data = query_clinvar(hgvs_c_full, ncbi_api_key)
|
|
230
225
|
clinvar_text = format_clinvar_for_prompt(clinvar_data)
|
|
231
226
|
|
|
232
|
-
# Load extractions and metadata for each paper. PDF bytes and
|
|
233
|
-
#
|
|
234
|
-
#
|
|
235
|
-
#
|
|
236
|
-
# surfaces as FileNotFoundError below.
|
|
227
|
+
# Load extractions and metadata for each paper. PDF bytes and markdown
|
|
228
|
+
# are NOT loaded here — the post-LLM citation resolver loads the paper's
|
|
229
|
+
# pre-built `pdf_index.pkl.zst` directly from storage, so this step only
|
|
230
|
+
# needs the LLM inputs.
|
|
237
231
|
evidence_extractions: list[dict[str, Any]] = []
|
|
238
|
-
pdf_bytes_cache: dict[str, bytes] = {}
|
|
239
|
-
markdown_cache: dict[str, str] = {}
|
|
240
232
|
metadata_cache: dict[str, dict[str, Any]] = {}
|
|
241
233
|
|
|
242
234
|
for doi in dois:
|
|
@@ -252,8 +244,6 @@ async def aggregate_evidence_async(
|
|
|
252
244
|
log.info('Skipping %s: variant not discussed', doi)
|
|
253
245
|
continue
|
|
254
246
|
|
|
255
|
-
pdf_bytes_cache[doi] = read_bytes(paper_url(base, doi, 'source.pdf'))
|
|
256
|
-
markdown_cache[doi] = read_text(paper_url(base, doi, 'markdown.md'))
|
|
257
247
|
metadata = read_json(paper_url(base, doi, 'metadata.json'))
|
|
258
248
|
metadata_cache[doi] = metadata
|
|
259
249
|
|
|
@@ -332,7 +322,7 @@ async def aggregate_evidence_async(
|
|
|
332
322
|
# Post-LLM: resolve quotes to bboxes, replace paper_id with DOI
|
|
333
323
|
aggregate_dict = output.model_dump()
|
|
334
324
|
with logfire.span('flowa.resolve_citations', paper_count=len(paper_id_to_doi)):
|
|
335
|
-
resolve_aggregate_citations(aggregate_dict, paper_id_to_doi,
|
|
325
|
+
resolve_aggregate_citations(aggregate_dict, paper_id_to_doi, base, metadata_cache)
|
|
336
326
|
|
|
337
327
|
# Store structured aggregation result
|
|
338
328
|
write_json(aggregation_url, with_schema_version(aggregate_dict, AGGREGATION_SCHEMA_VERSION))
|
|
@@ -17,9 +17,11 @@ from pydantic_ai import Agent
|
|
|
17
17
|
from pydantic_ai.messages import BinaryContent
|
|
18
18
|
|
|
19
19
|
from flowa.models import create_model, get_model_settings
|
|
20
|
+
from flowa.pdf_index_cache import build as build_pdf_index_payload
|
|
21
|
+
from flowa.pdf_index_cache import serialize as serialize_pdf_index_payload
|
|
20
22
|
from flowa.prompts import load_text_prompt
|
|
21
23
|
from flowa.settings import ModelConfig, Settings
|
|
22
|
-
from flowa.storage import exists, paper_url, read_bytes, write_bytes, write_text
|
|
24
|
+
from flowa.storage import exists, paper_url, read_bytes, read_text, write_bytes, write_text
|
|
23
25
|
|
|
24
26
|
log = logging.getLogger(__name__)
|
|
25
27
|
|
|
@@ -120,14 +122,22 @@ async def transcribe(
|
|
|
120
122
|
|
|
121
123
|
|
|
122
124
|
async def convert_paper_async(base: str, doi: str, model: ModelConfig, prompt_set: str = 'generic') -> None:
|
|
123
|
-
"""Convert a single paper's PDF to Markdown
|
|
125
|
+
"""Convert a single paper's PDF to Markdown and persist its `PdfIndex`.
|
|
124
126
|
|
|
125
|
-
Reads PDF from papers/{encoded_doi}/source.pdf
|
|
126
|
-
|
|
127
|
+
Reads PDF from papers/{encoded_doi}/source.pdf and writes
|
|
128
|
+
papers/{encoded_doi}/markdown.md plus papers/{encoded_doi}/pdf_index.pkl.zst
|
|
129
|
+
(consumed by the gateway's resolve endpoint).
|
|
130
|
+
|
|
131
|
+
Either artifact can be missing independently — if a previous run failed
|
|
132
|
+
or pre-dates the pdf_index step, the next call fills in only what's
|
|
133
|
+
missing without re-transcribing or re-building work that's already done.
|
|
127
134
|
"""
|
|
128
135
|
md_url = paper_url(base, doi, 'markdown.md')
|
|
136
|
+
index_url = paper_url(base, doi, 'pdf_index.pkl.zst')
|
|
137
|
+
md_needed = not exists(md_url)
|
|
138
|
+
index_needed = not exists(index_url)
|
|
129
139
|
|
|
130
|
-
if
|
|
140
|
+
if not md_needed and not index_needed:
|
|
131
141
|
log.info('Already converted: %s', md_url)
|
|
132
142
|
return
|
|
133
143
|
|
|
@@ -138,21 +148,36 @@ async def convert_paper_async(base: str, doi: str, model: ModelConfig, prompt_se
|
|
|
138
148
|
log.info('Skipping DOI %s: PDF not available', doi)
|
|
139
149
|
return
|
|
140
150
|
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
151
|
+
markdown: str | None = None
|
|
152
|
+
if md_needed:
|
|
153
|
+
log.info(
|
|
154
|
+
'Converting DOI %s (%d bytes, model: %s, chunk: %d pages)', doi, len(pdf_bytes), model.name, PAGES_PER_CHUNK
|
|
155
|
+
)
|
|
156
|
+
prompt = load_text_prompt('transcription', prompt_set)
|
|
157
|
+
t0 = time.monotonic()
|
|
158
|
+
result = await transcribe(pdf_bytes, model=model, prompt=prompt, page_count=PAGES_PER_CHUNK)
|
|
159
|
+
elapsed = time.monotonic() - t0
|
|
160
|
+
|
|
161
|
+
markdown = result.markdown
|
|
162
|
+
write_text(md_url, markdown)
|
|
163
|
+
write_bytes(paper_url(base, doi, 'convert_raw.json'), json.dumps(result.all_messages).encode())
|
|
164
|
+
log.info('Converted DOI %s: %d chars in %.1fs', doi, len(markdown), elapsed)
|
|
165
|
+
|
|
166
|
+
if index_needed:
|
|
167
|
+
# PdfIndex construction is CPU-bound (~8s on the deployed gateway
|
|
168
|
+
# hardware) and dominates per-call latency at `/api/v1/resolve` if
|
|
169
|
+
# rebuilt on every call. Pay the cost here once per paper and ship
|
|
170
|
+
# the result; see `flowa.pdf_index_cache` for the storage format.
|
|
171
|
+
# `asyncio.to_thread` keeps the rest of the convert pipeline (other
|
|
172
|
+
# papers being transcribed concurrently) unblocked.
|
|
173
|
+
if markdown is None: # index missing but markdown already on disk
|
|
174
|
+
markdown = read_text(md_url)
|
|
175
|
+
t0 = time.monotonic()
|
|
176
|
+
blob = await asyncio.to_thread(
|
|
177
|
+
lambda: serialize_pdf_index_payload(build_pdf_index_payload(pdf_bytes, markdown))
|
|
178
|
+
)
|
|
179
|
+
write_bytes(index_url, blob)
|
|
180
|
+
log.info('Wrote pdf_index for DOI %s: %.1f MB in %.1fs', doi, len(blob) / 1e6, time.monotonic() - t0)
|
|
156
181
|
|
|
157
182
|
|
|
158
183
|
def convert_paper(
|
|
@@ -0,0 +1,133 @@
|
|
|
1
|
+
"""Build, serialise, and load cached `PdfIndex` artifacts.
|
|
2
|
+
|
|
3
|
+
The gateway's per-call cost is dominated by `PdfIndex(pdf_bytes)` construction
|
|
4
|
+
— ~8s on the deployed gateway hardware for a typical paper, against ~300ms
|
|
5
|
+
for the actual quote alignment afterwards. To avoid paying that on every
|
|
6
|
+
`/api/v1/resolve` call, the pipeline persists the constructed index at
|
|
7
|
+
`papers/{encoded_doi}/pdf_index.pkl.zst` so the gateway can load instead of
|
|
8
|
+
rebuild.
|
|
9
|
+
|
|
10
|
+
On-the-wire format: zstd-compressed pickle of a single dict:
|
|
11
|
+
|
|
12
|
+
{
|
|
13
|
+
"format_version": int, # bumped when this module changes what it serialises
|
|
14
|
+
"source_pdf_sha256": str, # sha256 hex digest of source.pdf bytes
|
|
15
|
+
"pdf_index": PdfIndex,
|
|
16
|
+
}
|
|
17
|
+
|
|
18
|
+
The header fields let `deserialize` reject artifacts that are out of sync
|
|
19
|
+
with the current source.pdf or with the current cache format — a stale
|
|
20
|
+
pickle would silently produce wrong bboxes. The format version is *our*
|
|
21
|
+
contract over the persisted shape: bump it when this module changes which
|
|
22
|
+
fields it stores or how it stores them. It is not tied to anchorite's
|
|
23
|
+
release cadence — anchorite patch/minor releases that preserve the pickle
|
|
24
|
+
shape of `PdfIndex` deserialise fine.
|
|
25
|
+
|
|
26
|
+
zstd level 3 was chosen empirically (see specs/supplements.md): ~5x faster
|
|
27
|
+
compression than gzip -6 at comparable ratio (~1/5 of the pickled size),
|
|
28
|
+
and a touch faster decompression. The pipeline pays the compress cost once
|
|
29
|
+
per paper; the gateway pays the decompress cost on every cold load, so
|
|
30
|
+
optimising for both directions matters.
|
|
31
|
+
|
|
32
|
+
Anchorite documents that `PdfIndex` pickles cleanly (state is str/bytes/
|
|
33
|
+
list[int]/frozen dataclasses); no custom reducers needed.
|
|
34
|
+
"""
|
|
35
|
+
|
|
36
|
+
from __future__ import annotations
|
|
37
|
+
|
|
38
|
+
import hashlib
|
|
39
|
+
import pickle
|
|
40
|
+
from dataclasses import dataclass
|
|
41
|
+
|
|
42
|
+
import zstandard
|
|
43
|
+
from anchorite import PdfIndex
|
|
44
|
+
|
|
45
|
+
# Bump when the persisted shape changes: new fields, removed fields, semantic
|
|
46
|
+
# meaning changes. Anchorite version bumps that preserve `PdfIndex` pickle
|
|
47
|
+
# compatibility do NOT require a bump here — those deserialise correctly
|
|
48
|
+
# under the existing format. If a new anchorite release changes `PdfIndex`'s
|
|
49
|
+
# internals such that old pickles still load but produce different bboxes,
|
|
50
|
+
# bump this to force a re-backfill.
|
|
51
|
+
FORMAT_VERSION = 1
|
|
52
|
+
|
|
53
|
+
ZSTD_LEVEL = 3
|
|
54
|
+
PICKLE_PROTOCOL = pickle.HIGHEST_PROTOCOL
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
class StaleIndexError(Exception):
|
|
58
|
+
"""Persisted artifact's header doesn't match the runtime expectation.
|
|
59
|
+
|
|
60
|
+
Raised by `deserialize` when the pickle was written under a different
|
|
61
|
+
`FORMAT_VERSION` or against a source.pdf with a different sha256. The
|
|
62
|
+
caller decides whether to rebuild or surface an error — this module
|
|
63
|
+
deliberately doesn't fall back, because silent rebuild masks pipeline
|
|
64
|
+
drift.
|
|
65
|
+
"""
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
@dataclass(frozen=True)
|
|
69
|
+
class PdfIndexPayload:
|
|
70
|
+
"""In-memory view of the persisted artifact."""
|
|
71
|
+
|
|
72
|
+
format_version: int
|
|
73
|
+
source_pdf_sha256: str
|
|
74
|
+
pdf_index: PdfIndex
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def build(pdf_bytes: bytes, markdown: str) -> PdfIndexPayload:
|
|
78
|
+
"""Construct a PdfIndex pinned to its source PDF by sha256.
|
|
79
|
+
|
|
80
|
+
`markdown` is the paper's transcription, used by anchorite to denoise the
|
|
81
|
+
indexed PDF char string (drop running heads, page numbers, footnote
|
|
82
|
+
markers the LLM didn't transcribe), which improves quote alignment.
|
|
83
|
+
"""
|
|
84
|
+
# markdown is threaded through but not yet forwarded to PdfIndex: the
|
|
85
|
+
# anchorite-#19 markdown denoise drops entire pages of atoms when the
|
|
86
|
+
# markdown reorders content relative to PDF page order. Switch to
|
|
87
|
+
# `markdown=markdown` once the upstream fix lands — and bump
|
|
88
|
+
# FORMAT_VERSION + re-backfill, since the denoised index resolves quotes
|
|
89
|
+
# against a different cached char string (existing pickles would silently
|
|
90
|
+
# produce different bboxes).
|
|
91
|
+
return PdfIndexPayload(
|
|
92
|
+
format_version=FORMAT_VERSION,
|
|
93
|
+
source_pdf_sha256=hashlib.sha256(pdf_bytes).hexdigest(),
|
|
94
|
+
pdf_index=PdfIndex(pdf_bytes, markdown=None),
|
|
95
|
+
)
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def serialize(payload: PdfIndexPayload) -> bytes:
|
|
99
|
+
"""Pickle + zstd-compress for upload to S3."""
|
|
100
|
+
pkl = pickle.dumps(
|
|
101
|
+
{
|
|
102
|
+
'format_version': payload.format_version,
|
|
103
|
+
'source_pdf_sha256': payload.source_pdf_sha256,
|
|
104
|
+
'pdf_index': payload.pdf_index,
|
|
105
|
+
},
|
|
106
|
+
protocol=PICKLE_PROTOCOL,
|
|
107
|
+
)
|
|
108
|
+
return zstandard.ZstdCompressor(level=ZSTD_LEVEL).compress(pkl)
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def deserialize(blob: bytes, *, expected_pdf_sha256: str | None = None) -> PdfIndexPayload:
|
|
112
|
+
"""Decompress + unpickle. Verifies the header before returning.
|
|
113
|
+
|
|
114
|
+
The runtime `FORMAT_VERSION` is always checked. The `source_pdf_sha256`
|
|
115
|
+
check is only performed when the caller supplies a value to compare
|
|
116
|
+
against — the gateway typically skips it (would require fetching source.pdf
|
|
117
|
+
just to hash it). Callers that already have the source bytes pass the
|
|
118
|
+
digest in to catch pipeline drift.
|
|
119
|
+
"""
|
|
120
|
+
pkl = zstandard.ZstdDecompressor().decompress(blob)
|
|
121
|
+
raw = pickle.loads(pkl)
|
|
122
|
+
payload = PdfIndexPayload(
|
|
123
|
+
format_version=raw['format_version'],
|
|
124
|
+
source_pdf_sha256=raw['source_pdf_sha256'],
|
|
125
|
+
pdf_index=raw['pdf_index'],
|
|
126
|
+
)
|
|
127
|
+
if payload.format_version != FORMAT_VERSION:
|
|
128
|
+
raise StaleIndexError(f'format version mismatch: pickle={payload.format_version!r} runtime={FORMAT_VERSION!r}')
|
|
129
|
+
if expected_pdf_sha256 is not None and payload.source_pdf_sha256 != expected_pdf_sha256:
|
|
130
|
+
raise StaleIndexError(
|
|
131
|
+
f'source.pdf hash mismatch: pickle={payload.source_pdf_sha256!r} actual={expected_pdf_sha256!r}'
|
|
132
|
+
)
|
|
133
|
+
return payload
|