flowapy 0.1.3__tar.gz → 0.2.0__tar.gz

This diff compares the contents of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (307)
  1. {flowapy-0.1.3 → flowapy-0.2.0}/PKG-INFO +2 -1
  2. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/README.md +11 -9
  3. flowapy-0.2.0/examples/demo/fixtures/papers/10.1002%2Fhumu.23878/pdf_index.pkl.zst +0 -0
  4. flowapy-0.2.0/examples/demo/fixtures/papers/10.1016%2Fj.ymgmr.2024.101163/pdf_index.pkl.zst +0 -0
  5. flowapy-0.2.0/examples/demo/fixtures/papers/10.1038%2Fs41598-022-25914-8/pdf_index.pkl.zst +0 -0
  6. flowapy-0.2.0/examples/demo/fixtures/papers/10.1186%2Fs12881-019-0878-8/pdf_index.pkl.zst +0 -0
  7. flowapy-0.2.0/examples/demo/fixtures/papers/10.1186%2Fs13023-021-01817-1/pdf_index.pkl.zst +0 -0
  8. flowapy-0.2.0/examples/demo/fixtures/papers/10.1186%2Fs13023-021-02146-z/pdf_index.pkl.zst +0 -0
  9. flowapy-0.2.0/examples/demo/fixtures/papers/10.1186%2Fs13023-023-02848-6/pdf_index.pkl.zst +0 -0
  10. flowapy-0.2.0/examples/demo/fixtures/papers/10.1186%2Fs13052-019-0692-0/pdf_index.pkl.zst +0 -0
  11. flowapy-0.2.0/examples/demo/fixtures/papers/10.3389%2Ffcvm.2022.1061384/pdf_index.pkl.zst +0 -0
  12. flowapy-0.2.0/examples/demo/fixtures/papers/10.3389%2Ffcvm.2023.1261172/pdf_index.pkl.zst +0 -0
  13. flowapy-0.2.0/examples/demo/fixtures/papers/10.3389%2Ffimmu.2024.1336599/pdf_index.pkl.zst +0 -0
  14. flowapy-0.2.0/examples/demo/fixtures/papers/10.3389%2Ffped.2021.729824/pdf_index.pkl.zst +0 -0
  15. flowapy-0.2.0/examples/demo/fixtures/papers/10.3389%2Ffphar.2022.903488/pdf_index.pkl.zst +0 -0
  16. flowapy-0.2.0/examples/demo/fixtures/papers/10.3390%2Fijns11010016/pdf_index.pkl.zst +0 -0
  17. flowapy-0.2.0/examples/demo/fixtures/papers/10.3390%2Fijns6020031/pdf_index.pkl.zst +0 -0
  18. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo-gateway/src/demo_gateway/main.py +13 -18
  19. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo-gateway/tests/test_resolve.py +13 -28
  20. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo-gateway/uv.lock +28 -1
  21. {flowapy-0.1.3 → flowapy-0.2.0}/packages/chat-service/package.json +2 -2
  22. {flowapy-0.1.3 → flowapy-0.2.0}/pyproject.toml +2 -1
  23. {flowapy-0.1.3 → flowapy-0.2.0}/src/flowa/aggregate.py +11 -21
  24. {flowapy-0.1.3 → flowapy-0.2.0}/src/flowa/convert.py +45 -20
  25. flowapy-0.2.0/src/flowa/pdf_index_cache.py +133 -0
  26. {flowapy-0.1.3 → flowapy-0.2.0}/src/flowa/resolve.py +54 -69
  27. flowapy-0.2.0/tests/test_pdf_index_cache.py +120 -0
  28. flowapy-0.2.0/tests/test_resolve.py +158 -0
  29. {flowapy-0.1.3 → flowapy-0.2.0}/uv.lock +28 -1
  30. flowapy-0.1.3/tests/test_resolve.py +0 -207
  31. {flowapy-0.1.3 → flowapy-0.2.0}/.env.example +0 -0
  32. {flowapy-0.1.3 → flowapy-0.2.0}/.github/dependabot.yml +0 -0
  33. {flowapy-0.1.3 → flowapy-0.2.0}/.github/workflows/dependabot-auto-merge.yml +0 -0
  34. {flowapy-0.1.3 → flowapy-0.2.0}/.github/workflows/lint.yaml +0 -0
  35. {flowapy-0.1.3 → flowapy-0.2.0}/.github/workflows/release-chat-service.yaml +0 -0
  36. {flowapy-0.1.3 → flowapy-0.2.0}/.github/workflows/release-flowapy.yaml +0 -0
  37. {flowapy-0.1.3 → flowapy-0.2.0}/.github/workflows/release-react-viewer.yaml +0 -0
  38. {flowapy-0.1.3 → flowapy-0.2.0}/.gitignore +0 -0
  39. {flowapy-0.1.3 → flowapy-0.2.0}/.markdownlint.json +0 -0
  40. {flowapy-0.1.3 → flowapy-0.2.0}/.nvmrc +0 -0
  41. {flowapy-0.1.3 → flowapy-0.2.0}/.pre-commit-config.yaml +0 -0
  42. {flowapy-0.1.3 → flowapy-0.2.0}/.prettierignore +0 -0
  43. {flowapy-0.1.3 → flowapy-0.2.0}/Dockerfile +0 -0
  44. {flowapy-0.1.3 → flowapy-0.2.0}/LICENSE +0 -0
  45. {flowapy-0.1.3 → flowapy-0.2.0}/README.md +0 -0
  46. {flowapy-0.1.3 → flowapy-0.2.0}/docs/images/viewer.png +0 -0
  47. {flowapy-0.1.3 → flowapy-0.2.0}/examples/.gitkeep +0 -0
  48. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/.env.example +0 -0
  49. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/.gitignore +0 -0
  50. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/LICENSES.md +0 -0
  51. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/assessments/NM_000152_5-c_1935C_A/aggregation.json +0 -0
  52. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/assessments/NM_000152_5-c_1935C_A/extractions/10.1002%2Fhumu.23878.json +0 -0
  53. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/assessments/NM_000152_5-c_1935C_A/extractions/10.1016%2Fj.ymgmr.2024.101163.json +0 -0
  54. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/assessments/NM_000152_5-c_1935C_A/extractions/10.1038%2Fs41598-022-25914-8.json +0 -0
  55. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/assessments/NM_000152_5-c_1935C_A/extractions/10.1186%2Fs12881-019-0878-8.json +0 -0
  56. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/assessments/NM_000152_5-c_1935C_A/extractions/10.1186%2Fs13023-021-01817-1.json +0 -0
  57. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/assessments/NM_000152_5-c_1935C_A/extractions/10.1186%2Fs13023-021-02146-z.json +0 -0
  58. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/assessments/NM_000152_5-c_1935C_A/extractions/10.1186%2Fs13023-023-02848-6.json +0 -0
  59. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/assessments/NM_000152_5-c_1935C_A/extractions/10.1186%2Fs13052-019-0692-0.json +0 -0
  60. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/assessments/NM_000152_5-c_1935C_A/extractions/10.3389%2Ffcvm.2022.1061384.json +0 -0
  61. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/assessments/NM_000152_5-c_1935C_A/extractions/10.3389%2Ffcvm.2023.1261172.json +0 -0
  62. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/assessments/NM_000152_5-c_1935C_A/extractions/10.3389%2Ffimmu.2024.1336599.json +0 -0
  63. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/assessments/NM_000152_5-c_1935C_A/extractions/10.3389%2Ffped.2021.729824.json +0 -0
  64. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/assessments/NM_000152_5-c_1935C_A/extractions/10.3389%2Ffphar.2022.903488.json +0 -0
  65. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/assessments/NM_000152_5-c_1935C_A/extractions/10.3390%2Fijns11010016.json +0 -0
  66. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/assessments/NM_000152_5-c_1935C_A/extractions/10.3390%2Fijns6020031.json +0 -0
  67. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/assessments/NM_000152_5-c_1935C_A/query.json +0 -0
  68. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/assessments/NM_000152_5-c_1935C_A/runs/cfc0186a7b7e46eb802a516b86ec207f/progress.jsonl +0 -0
  69. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/assessments/NM_000152_5-c_1935C_A/variant_details.json +0 -0
  70. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/papers/10.1002%2Fajmg.a.61481/metadata.json +0 -0
  71. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/papers/10.1002%2Fhumu.23878/markdown.md +0 -0
  72. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/papers/10.1002%2Fhumu.23878/metadata.json +0 -0
  73. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/papers/10.1002%2Fhumu.23878/source.pdf +0 -0
  74. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/papers/10.1016%2Fj.ejmg.2020.103997/metadata.json +0 -0
  75. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/papers/10.1016%2Fj.nmd.2022.02.002/metadata.json +0 -0
  76. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/papers/10.1016%2Fj.tjog.2022.07.008/metadata.json +0 -0
  77. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/papers/10.1016%2Fj.ymgmr.2024.101163/markdown.md +0 -0
  78. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/papers/10.1016%2Fj.ymgmr.2024.101163/metadata.json +0 -0
  79. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/papers/10.1016%2Fj.ymgmr.2024.101163/source.pdf +0 -0
  80. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/papers/10.1038%2Fs41598-022-25914-8/markdown.md +0 -0
  81. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/papers/10.1038%2Fs41598-022-25914-8/metadata.json +0 -0
  82. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/papers/10.1038%2Fs41598-022-25914-8/source.pdf +0 -0
  83. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/papers/10.1093%2Fhmg%2Fddz218/metadata.json +0 -0
  84. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/papers/10.1136%2Fjmg-2022-108675/metadata.json +0 -0
  85. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/papers/10.1186%2Fs12881-019-0878-8/markdown.md +0 -0
  86. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/papers/10.1186%2Fs12881-019-0878-8/metadata.json +0 -0
  87. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/papers/10.1186%2Fs12881-019-0878-8/source.pdf +0 -0
  88. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/papers/10.1186%2Fs13023-021-01817-1/markdown.md +0 -0
  89. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/papers/10.1186%2Fs13023-021-01817-1/metadata.json +0 -0
  90. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/papers/10.1186%2Fs13023-021-01817-1/source.pdf +0 -0
  91. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/papers/10.1186%2Fs13023-021-02146-z/markdown.md +0 -0
  92. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/papers/10.1186%2Fs13023-021-02146-z/metadata.json +0 -0
  93. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/papers/10.1186%2Fs13023-021-02146-z/source.pdf +0 -0
  94. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/papers/10.1186%2Fs13023-023-02848-6/markdown.md +0 -0
  95. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/papers/10.1186%2Fs13023-023-02848-6/metadata.json +0 -0
  96. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/papers/10.1186%2Fs13023-023-02848-6/source.pdf +0 -0
  97. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/papers/10.1186%2Fs13052-019-0692-0/markdown.md +0 -0
  98. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/papers/10.1186%2Fs13052-019-0692-0/metadata.json +0 -0
  99. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/papers/10.1186%2Fs13052-019-0692-0/source.pdf +0 -0
  100. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/papers/10.3389%2Ffcvm.2022.1061384/markdown.md +0 -0
  101. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/papers/10.3389%2Ffcvm.2022.1061384/metadata.json +0 -0
  102. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/papers/10.3389%2Ffcvm.2022.1061384/source.pdf +0 -0
  103. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/papers/10.3389%2Ffcvm.2023.1261172/markdown.md +0 -0
  104. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/papers/10.3389%2Ffcvm.2023.1261172/metadata.json +0 -0
  105. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/papers/10.3389%2Ffcvm.2023.1261172/source.pdf +0 -0
  106. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/papers/10.3389%2Ffimmu.2024.1336599/markdown.md +0 -0
  107. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/papers/10.3389%2Ffimmu.2024.1336599/metadata.json +0 -0
  108. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/papers/10.3389%2Ffimmu.2024.1336599/source.pdf +0 -0
  109. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/papers/10.3389%2Ffped.2021.729824/markdown.md +0 -0
  110. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/papers/10.3389%2Ffped.2021.729824/metadata.json +0 -0
  111. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/papers/10.3389%2Ffped.2021.729824/source.pdf +0 -0
  112. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/papers/10.3389%2Ffphar.2022.903488/markdown.md +0 -0
  113. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/papers/10.3389%2Ffphar.2022.903488/metadata.json +0 -0
  114. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/papers/10.3389%2Ffphar.2022.903488/source.pdf +0 -0
  115. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/papers/10.3390%2Fijns11010016/markdown.md +0 -0
  116. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/papers/10.3390%2Fijns11010016/metadata.json +0 -0
  117. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/papers/10.3390%2Fijns11010016/source.pdf +0 -0
  118. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/papers/10.3390%2Fijns6020031/markdown.md +0 -0
  119. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/papers/10.3390%2Fijns6020031/metadata.json +0 -0
  120. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/fixtures/papers/10.3390%2Fijns6020031/source.pdf +0 -0
  121. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/next-env.d.ts +0 -0
  122. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/next.config.mjs +0 -0
  123. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/package.json +0 -0
  124. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/postcss.config.cjs +0 -0
  125. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/public/favicon.svg +0 -0
  126. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/scripts/chat-service.ts +0 -0
  127. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/scripts/copy-pdfjs-assets.ts +0 -0
  128. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/scripts/exercise-llm.ts +0 -0
  129. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/scripts/start.ts +0 -0
  130. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/src/components/literature/LiteratureView.tsx +0 -0
  131. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/src/components/literature/PaperStatusGroup.tsx +0 -0
  132. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/src/components/literature/ProgressLog.tsx +0 -0
  133. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/src/components/literature/matchFilename.ts +0 -0
  134. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/src/db/migrate.ts +0 -0
  135. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/src/db/schema.sql +0 -0
  136. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/src/lib/aggregate.ts +0 -0
  137. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/src/lib/chatSessionClient.ts +0 -0
  138. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/src/lib/citationResolverClient.ts +0 -0
  139. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/src/lib/demoConfig.ts +0 -0
  140. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/src/lib/papers.ts +0 -0
  141. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/src/lib/progressEvents.ts +0 -0
  142. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/src/lib/runs.ts +0 -0
  143. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/src/lib/triageBackendClient.ts +0 -0
  144. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/src/lib/triageDb.ts +0 -0
  145. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/src/lib/variantId.ts +0 -0
  146. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/src/pages/_app.tsx +0 -0
  147. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/src/pages/api/aggregate/[variantId]/[category].ts +0 -0
  148. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/src/pages/api/edit-drafts/[variantId]/[category]/[version].ts +0 -0
  149. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/src/pages/api/edit-drafts/[variantId]/[category]/index.ts +0 -0
  150. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/src/pages/api/papers/[doi]/pdf.ts +0 -0
  151. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/src/pages/api/papers/index.ts +0 -0
  152. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/src/pages/api/runs/[variantId]/[runId]/progress.ts +0 -0
  153. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/src/pages/api/runs/index.ts +0 -0
  154. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/src/pages/api/runs/latest.ts +0 -0
  155. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/src/pages/api/triage/claim.ts +0 -0
  156. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/src/pages/api/triage/comment.ts +0 -0
  157. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/src/pages/api/triage/paper-done.ts +0 -0
  158. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/src/pages/api/triage/snapshot/[variantId]/[category]/[version].ts +0 -0
  159. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/src/pages/index.tsx +0 -0
  160. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/src/pages/variants/[variantId].tsx +0 -0
  161. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/src/pages/viewer/[variantId]/[category].tsx +0 -0
  162. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/src/styles/globals.css +0 -0
  163. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/tailwind.config.ts +0 -0
  164. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/test/LiteratureView.test.tsx +0 -0
  165. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/test/ProgressLog.test.tsx +0 -0
  166. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/test/aggregate.test.ts +0 -0
  167. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/test/chat-service.test.ts +0 -0
  168. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/test/index-page.test.tsx +0 -0
  169. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/test/matchFilename.test.ts +0 -0
  170. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/test/papers-pdf-upload.test.ts +0 -0
  171. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/test/papers-route.test.ts +0 -0
  172. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/test/papers.test.ts +0 -0
  173. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/test/progress-route.test.ts +0 -0
  174. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/test/runs-latest-route.test.ts +0 -0
  175. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/test/runs-route.test.ts +0 -0
  176. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/test/runs.test.ts +0 -0
  177. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/test/setup.ts +0 -0
  178. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/test/triage.test.ts +0 -0
  179. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/test/variantId.test.ts +0 -0
  180. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/tsconfig.json +0 -0
  181. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo/vitest.config.ts +0 -0
  182. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo-gateway/README.md +0 -0
  183. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo-gateway/pyproject.toml +0 -0
  184. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo-gateway/src/demo_gateway/__init__.py +0 -0
  185. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo-gateway/src/demo_gateway/config.py +0 -0
  186. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo-gateway/src/demo_gateway/progress.py +0 -0
  187. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo-gateway/src/demo_gateway/runs.py +0 -0
  188. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo-gateway/tests/__init__.py +0 -0
  189. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo-gateway/tests/conftest.py +0 -0
  190. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo-gateway/tests/test_main.py +0 -0
  191. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo-gateway/tests/test_progress.py +0 -0
  192. {flowapy-0.1.3 → flowapy-0.2.0}/examples/demo-gateway/tests/test_runs.py +0 -0
  193. {flowapy-0.1.3 → flowapy-0.2.0}/package.json +0 -0
  194. {flowapy-0.1.3 → flowapy-0.2.0}/packages/.gitkeep +0 -0
  195. {flowapy-0.1.3 → flowapy-0.2.0}/packages/chat-service/Dockerfile +0 -0
  196. {flowapy-0.1.3 → flowapy-0.2.0}/packages/chat-service/LICENSE +0 -0
  197. {flowapy-0.1.3 → flowapy-0.2.0}/packages/chat-service/README.md +0 -0
  198. {flowapy-0.1.3 → flowapy-0.2.0}/packages/chat-service/src/artifact.ts +0 -0
  199. {flowapy-0.1.3 → flowapy-0.2.0}/packages/chat-service/src/audit.ts +0 -0
  200. {flowapy-0.1.3 → flowapy-0.2.0}/packages/chat-service/src/auth/jwt.ts +0 -0
  201. {flowapy-0.1.3 → flowapy-0.2.0}/packages/chat-service/src/auth/oidc.ts +0 -0
  202. {flowapy-0.1.3 → flowapy-0.2.0}/packages/chat-service/src/chat.ts +0 -0
  203. {flowapy-0.1.3 → flowapy-0.2.0}/packages/chat-service/src/cli.ts +0 -0
  204. {flowapy-0.1.3 → flowapy-0.2.0}/packages/chat-service/src/config.ts +0 -0
  205. {flowapy-0.1.3 → flowapy-0.2.0}/packages/chat-service/src/index.ts +0 -0
  206. {flowapy-0.1.3 → flowapy-0.2.0}/packages/chat-service/src/instrumentation.ts +0 -0
  207. {flowapy-0.1.3 → flowapy-0.2.0}/packages/chat-service/src/llm/anthropic.ts +0 -0
  208. {flowapy-0.1.3 → flowapy-0.2.0}/packages/chat-service/src/llm/bedrock.ts +0 -0
  209. {flowapy-0.1.3 → flowapy-0.2.0}/packages/chat-service/src/llm/factory.ts +0 -0
  210. {flowapy-0.1.3 → flowapy-0.2.0}/packages/chat-service/src/llm/google-gla.ts +0 -0
  211. {flowapy-0.1.3 → flowapy-0.2.0}/packages/chat-service/src/llm/google-vertex.ts +0 -0
  212. {flowapy-0.1.3 → flowapy-0.2.0}/packages/chat-service/src/llm/interface.ts +0 -0
  213. {flowapy-0.1.3 → flowapy-0.2.0}/packages/chat-service/src/llm/openai.ts +0 -0
  214. {flowapy-0.1.3 → flowapy-0.2.0}/packages/chat-service/src/prompts.ts +0 -0
  215. {flowapy-0.1.3 → flowapy-0.2.0}/packages/chat-service/src/server.ts +0 -0
  216. {flowapy-0.1.3 → flowapy-0.2.0}/packages/chat-service/src/session.ts +0 -0
  217. {flowapy-0.1.3 → flowapy-0.2.0}/packages/chat-service/src/storage/factory.ts +0 -0
  218. {flowapy-0.1.3 → flowapy-0.2.0}/packages/chat-service/src/storage/fs.ts +0 -0
  219. {flowapy-0.1.3 → flowapy-0.2.0}/packages/chat-service/src/storage/gcs.ts +0 -0
  220. {flowapy-0.1.3 → flowapy-0.2.0}/packages/chat-service/src/storage/interface.ts +0 -0
  221. {flowapy-0.1.3 → flowapy-0.2.0}/packages/chat-service/src/storage/s3.ts +0 -0
  222. {flowapy-0.1.3 → flowapy-0.2.0}/packages/chat-service/src/storage-keys.ts +0 -0
  223. {flowapy-0.1.3 → flowapy-0.2.0}/packages/chat-service/src/telemetry.ts +0 -0
  224. {flowapy-0.1.3 → flowapy-0.2.0}/packages/chat-service/src/text.ts +0 -0
  225. {flowapy-0.1.3 → flowapy-0.2.0}/packages/chat-service/src/yaml.ts +0 -0
  226. {flowapy-0.1.3 → flowapy-0.2.0}/packages/chat-service/test/chat.test.ts +0 -0
  227. {flowapy-0.1.3 → flowapy-0.2.0}/packages/chat-service/test/generic-prompt.test.ts +0 -0
  228. {flowapy-0.1.3 → flowapy-0.2.0}/packages/chat-service/test/llm-factory.test.ts +0 -0
  229. {flowapy-0.1.3 → flowapy-0.2.0}/packages/chat-service/test/oidc.test.ts +0 -0
  230. {flowapy-0.1.3 → flowapy-0.2.0}/packages/chat-service/test/paper-cache.test.ts +0 -0
  231. {flowapy-0.1.3 → flowapy-0.2.0}/packages/chat-service/test/storage-fs.test.ts +0 -0
  232. {flowapy-0.1.3 → flowapy-0.2.0}/packages/chat-service/test/storage-gcs.test.ts +0 -0
  233. {flowapy-0.1.3 → flowapy-0.2.0}/packages/chat-service/test/storage-s3.test.ts +0 -0
  234. {flowapy-0.1.3 → flowapy-0.2.0}/packages/chat-service/test/text.test.ts +0 -0
  235. {flowapy-0.1.3 → flowapy-0.2.0}/packages/chat-service/test/yaml.test.ts +0 -0
  236. {flowapy-0.1.3 → flowapy-0.2.0}/packages/chat-service/tsconfig.build.json +0 -0
  237. {flowapy-0.1.3 → flowapy-0.2.0}/packages/chat-service/tsconfig.json +0 -0
  238. {flowapy-0.1.3 → flowapy-0.2.0}/packages/chat-service/vitest.config.ts +0 -0
  239. {flowapy-0.1.3 → flowapy-0.2.0}/packages/react-viewer/LICENSE +0 -0
  240. {flowapy-0.1.3 → flowapy-0.2.0}/packages/react-viewer/README.md +0 -0
  241. {flowapy-0.1.3 → flowapy-0.2.0}/packages/react-viewer/package.json +0 -0
  242. {flowapy-0.1.3 → flowapy-0.2.0}/packages/react-viewer/src/citations/sanitize.test.ts +0 -0
  243. {flowapy-0.1.3 → flowapy-0.2.0}/packages/react-viewer/src/citations/sanitize.ts +0 -0
  244. {flowapy-0.1.3 → flowapy-0.2.0}/packages/react-viewer/src/citations/types.ts +0 -0
  245. {flowapy-0.1.3 → flowapy-0.2.0}/packages/react-viewer/src/index.ts +0 -0
  246. {flowapy-0.1.3 → flowapy-0.2.0}/packages/react-viewer/src/llm-content/LlmContent.test.tsx +0 -0
  247. {flowapy-0.1.3 → flowapy-0.2.0}/packages/react-viewer/src/llm-content/LlmContent.tsx +0 -0
  248. {flowapy-0.1.3 → flowapy-0.2.0}/packages/react-viewer/src/pdf-viewer/PdfHighlightViewer.test.tsx +0 -0
  249. {flowapy-0.1.3 → flowapy-0.2.0}/packages/react-viewer/src/pdf-viewer/PdfHighlightViewer.tsx +0 -0
  250. {flowapy-0.1.3 → flowapy-0.2.0}/packages/react-viewer/src/pdf-viewer/types.ts +0 -0
  251. {flowapy-0.1.3 → flowapy-0.2.0}/packages/react-viewer/src/styles.css +0 -0
  252. {flowapy-0.1.3 → flowapy-0.2.0}/packages/react-viewer/src/triage/ChatDrawer.tsx +0 -0
  253. {flowapy-0.1.3 → flowapy-0.2.0}/packages/react-viewer/src/triage/ChatSection.tsx +0 -0
  254. {flowapy-0.1.3 → flowapy-0.2.0}/packages/react-viewer/src/triage/ClaimList.tsx +0 -0
  255. {flowapy-0.1.3 → flowapy-0.2.0}/packages/react-viewer/src/triage/EvidenceViewerShell.test.tsx +0 -0
  256. {flowapy-0.1.3 → flowapy-0.2.0}/packages/react-viewer/src/triage/EvidenceViewerShell.tsx +0 -0
  257. {flowapy-0.1.3 → flowapy-0.2.0}/packages/react-viewer/src/triage/FocusCard.tsx +0 -0
  258. {flowapy-0.1.3 → flowapy-0.2.0}/packages/react-viewer/src/triage/PaperHeader.tsx +0 -0
  259. {flowapy-0.1.3 → flowapy-0.2.0}/packages/react-viewer/src/triage/PaperRail.test.tsx +0 -0
  260. {flowapy-0.1.3 → flowapy-0.2.0}/packages/react-viewer/src/triage/PaperRail.tsx +0 -0
  261. {flowapy-0.1.3 → flowapy-0.2.0}/packages/react-viewer/src/triage/SynthesisPanel.tsx +0 -0
  262. {flowapy-0.1.3 → flowapy-0.2.0}/packages/react-viewer/src/triage/backend.ts +0 -0
  263. {flowapy-0.1.3 → flowapy-0.2.0}/packages/react-viewer/src/triage/citation-resolver.ts +0 -0
  264. {flowapy-0.1.3 → flowapy-0.2.0}/packages/react-viewer/src/triage/citation-utils.test.ts +0 -0
  265. {flowapy-0.1.3 → flowapy-0.2.0}/packages/react-viewer/src/triage/citation-utils.ts +0 -0
  266. {flowapy-0.1.3 → flowapy-0.2.0}/packages/react-viewer/src/triage/claim-refs.test.ts +0 -0
  267. {flowapy-0.1.3 → flowapy-0.2.0}/packages/react-viewer/src/triage/claim-refs.ts +0 -0
  268. {flowapy-0.1.3 → flowapy-0.2.0}/packages/react-viewer/src/triage/keyboard.test.ts +0 -0
  269. {flowapy-0.1.3 → flowapy-0.2.0}/packages/react-viewer/src/triage/keyboard.ts +0 -0
  270. {flowapy-0.1.3 → flowapy-0.2.0}/packages/react-viewer/src/triage/store.test.ts +0 -0
  271. {flowapy-0.1.3 → flowapy-0.2.0}/packages/react-viewer/src/triage/store.ts +0 -0
  272. {flowapy-0.1.3 → flowapy-0.2.0}/packages/react-viewer/src/triage/types.ts +0 -0
  273. {flowapy-0.1.3 → flowapy-0.2.0}/packages/react-viewer/tailwind.config.ts +0 -0
  274. {flowapy-0.1.3 → flowapy-0.2.0}/packages/react-viewer/tsconfig.json +0 -0
  275. {flowapy-0.1.3 → flowapy-0.2.0}/packages/react-viewer/tsup.config.ts +0 -0
  276. {flowapy-0.1.3 → flowapy-0.2.0}/packages/react-viewer/vitest.config.ts +0 -0
  277. {flowapy-0.1.3 → flowapy-0.2.0}/pnpm-lock.yaml +0 -0
  278. {flowapy-0.1.3 → flowapy-0.2.0}/pnpm-workspace.yaml +0 -0
  279. {flowapy-0.1.3 → flowapy-0.2.0}/prompts/generic/aggregation_edit_prompt.txt +0 -0
  280. {flowapy-0.1.3 → flowapy-0.2.0}/prompts/generic/aggregation_edit_schema.ts +0 -0
  281. {flowapy-0.1.3 → flowapy-0.2.0}/prompts/generic/aggregation_prompt.txt +0 -0
  282. {flowapy-0.1.3 → flowapy-0.2.0}/prompts/generic/aggregation_schema.py +0 -0
  283. {flowapy-0.1.3 → flowapy-0.2.0}/prompts/generic/extraction_prompt.txt +0 -0
  284. {flowapy-0.1.3 → flowapy-0.2.0}/prompts/generic/extraction_schema.py +0 -0
  285. {flowapy-0.1.3 → flowapy-0.2.0}/prompts/generic/transcription_prompt.txt +0 -0
  286. {flowapy-0.1.3 → flowapy-0.2.0}/prompts/package.json +0 -0
  287. {flowapy-0.1.3 → flowapy-0.2.0}/src/flowa/__init__.py +0 -0
  288. {flowapy-0.1.3 → flowapy-0.2.0}/src/flowa/artifact.py +0 -0
  289. {flowapy-0.1.3 → flowapy-0.2.0}/src/flowa/cli.py +0 -0
  290. {flowapy-0.1.3 → flowapy-0.2.0}/src/flowa/clinvar.py +0 -0
  291. {flowapy-0.1.3 → flowapy-0.2.0}/src/flowa/download.py +0 -0
  292. {flowapy-0.1.3 → flowapy-0.2.0}/src/flowa/extract.py +0 -0
  293. {flowapy-0.1.3 → flowapy-0.2.0}/src/flowa/http_retry.py +0 -0
  294. {flowapy-0.1.3 → flowapy-0.2.0}/src/flowa/models.py +0 -0
  295. {flowapy-0.1.3 → flowapy-0.2.0}/src/flowa/normalize.py +0 -0
  296. {flowapy-0.1.3 → flowapy-0.2.0}/src/flowa/progress.py +0 -0
  297. {flowapy-0.1.3 → flowapy-0.2.0}/src/flowa/prompts/__init__.py +0 -0
  298. {flowapy-0.1.3 → flowapy-0.2.0}/src/flowa/py.typed +0 -0
  299. {flowapy-0.1.3 → flowapy-0.2.0}/src/flowa/query.py +0 -0
  300. {flowapy-0.1.3 → flowapy-0.2.0}/src/flowa/run.py +0 -0
  301. {flowapy-0.1.3 → flowapy-0.2.0}/src/flowa/schema.py +0 -0
  302. {flowapy-0.1.3 → flowapy-0.2.0}/src/flowa/settings.py +0 -0
  303. {flowapy-0.1.3 → flowapy-0.2.0}/src/flowa/storage.py +0 -0
  304. {flowapy-0.1.3 → flowapy-0.2.0}/tests/__init__.py +0 -0
  305. {flowapy-0.1.3 → flowapy-0.2.0}/tests/test_progress.py +0 -0
  306. {flowapy-0.1.3 → flowapy-0.2.0}/tests/test_prompts.py +0 -0
  307. {flowapy-0.1.3 → flowapy-0.2.0}/tsconfig.base.json +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: flowapy
3
- Version: 0.1.3
3
+ Version: 0.2.0
4
4
  Summary: Variant literature assessment pipeline with AI extraction
5
5
  Project-URL: Homepage, https://github.com/populationgenomics/flowa
6
6
  Project-URL: Source, https://github.com/populationgenomics/flowa
@@ -47,6 +47,7 @@ Requires-Dist: pypdf
47
47
  Requires-Dist: s3fs
48
48
  Requires-Dist: tenacity
49
49
  Requires-Dist: typer
50
+ Requires-Dist: zstandard
50
51
  Provides-Extra: anthropic
51
52
  Requires-Dist: pydantic-ai-slim[anthropic]==1.101.0; extra == 'anthropic'
52
53
  Provides-Extra: bedrock
@@ -166,10 +166,10 @@ VARIANT=NM_001035_3-c_14174A_G
166
166
  rm -f assessments/$VARIANT/aggregation.json \
167
167
  assessments/$VARIANT/aggregation_raw.json
168
168
  rm -rf assessments/$VARIANT/extractions/ assessments/$VARIANT/runs/
169
- # Re-runs flowa.convert (which uses anchorite for PDF chunking).
170
- # Drop this line to reuse the cached markdown and only redo extract +
171
- # aggregate.
172
- rm -f papers/*/markdown.md papers/*/convert_raw.json
169
+ # Re-runs flowa.convert (which uses anchorite for PDF chunking and
170
+ # builds pdf_index.pkl.zst). Drop this line to reuse the cached markdown
171
+ # + index and only redo extract + aggregate.
172
+ rm -f papers/*/markdown.md papers/*/convert_raw.json papers/*/pdf_index.pkl.zst
173
173
  ```
174
174
 
175
175
  Then drive the pipeline. The demo's `scripts/start.ts` translates the
@@ -208,11 +208,13 @@ and not needed by anything downstream.
208
208
  For papers whose source license blocks redistribution (CC-BY-NC-ND,
209
209
  paywalled; see `fixtures/LICENSES.md` for the rule), do **not** delete
210
210
  the whole `papers/{encodedDoi}/` directory — only delete `source.pdf`,
211
- `markdown.md`, and `convert_raw.json`. Keep `metadata.json` (the
212
- bibliographic fields are factual data, not copyrightable) but replace
213
- its `abstract` field with a sentinel string, so the omission reads as
214
- deliberate (not a missing-data bug) when the literature view renders
215
- the row:
211
+ `markdown.md`, `convert_raw.json`, and `pdf_index.pkl.zst`. The
212
+ `pdf_index.pkl.zst` embeds the PDF's extracted text (anchorite's char
213
+ index), so it carries the same copyright as `source.pdf` and must not
214
+ ship in the open-source repo. Keep `metadata.json` (the bibliographic
215
+ fields are factual data, not copyrightable) but replace its `abstract`
216
+ field with a sentinel string, so the omission reads as deliberate (not
217
+ a missing-data bug) when the literature view renders the row:
216
218
 
217
219
  ```bash
218
220
  python3 -c "
@@ -18,9 +18,13 @@ from typing import Annotated
18
18
  import uvicorn
19
19
  from fastapi import APIRouter, Depends, FastAPI, HTTPException, Query, Request, status
20
20
  from fastapi.middleware.cors import CORSMiddleware
21
- from flowa.resolve import ResolvedCitations, ResolveRequest, resolve_citations
21
+ from flowa.resolve import (
22
+ ResolvedCitations,
23
+ ResolveRequest,
24
+ load_pdf_index_from_storage,
25
+ resolve_citations,
26
+ )
22
27
  from flowa.schema import VariantSpec
23
- from flowa.storage import paper_url, read_bytes, read_text
24
28
  from pydantic import BaseModel, Field
25
29
 
26
30
  from .config import Settings
@@ -111,24 +115,15 @@ def resolve_citations_route(
111
115
  ) -> ResolvedCitations:
112
116
  """Align verbatim quotes to PDF bboxes.
113
117
 
114
- Sync `def` so FastAPI auto-runs it in the threadpool — anchorite's PDF
115
- parsing is CPU-bound and would block the asyncio loop otherwise.
118
+ Sync `def` so FastAPI auto-runs it in the threadpool — deserialising the
119
+ PdfIndex pickle and aligning quotes is CPU-bound and would block the
120
+ asyncio loop otherwise.
116
121
  """
117
122
  base = str(settings.demo_data_dir)
118
-
119
- def pdf_loader(doi: str) -> bytes | None:
120
- try:
121
- return read_bytes(paper_url(base, doi, 'source.pdf'))
122
- except FileNotFoundError:
123
- return None
124
-
125
- def md_loader(doi: str) -> str | None:
126
- try:
127
- return read_text(paper_url(base, doi, 'markdown.md'))
128
- except FileNotFoundError:
129
- return None
130
-
131
- return resolve_citations(body.citations, pdf_loader=pdf_loader, markdown_loader=md_loader)
123
+ return resolve_citations(
124
+ body.citations,
125
+ index_provider=lambda doi: load_pdf_index_from_storage(base, doi),
126
+ )
132
127
 
133
128
 
134
129
  @asynccontextmanager
@@ -1,24 +1,13 @@
1
1
  """HTTP shape tests for /resolve-citations.
2
2
 
3
3
  Library-level resolver behaviour is covered in flowa's `tests/test_resolve.py`;
4
- here we just check that the route plumbs Settings → loader flowa.resolve
5
- correctly and returns the expected wire shape.
4
+ here we just check that the route plumbs Settings → index_provider →
5
+ flowa.resolve correctly and returns the expected wire shape.
6
6
  """
7
7
 
8
- from pathlib import Path
9
-
10
8
  from fastapi.testclient import TestClient
11
- from flowa import resolve as flowa_resolve_module
12
- from flowa.storage import encode_doi
13
-
14
- from demo_gateway.config import Settings
15
-
16
9
 
17
- def _write_fake_paper(data_dir: Path, doi: str, pdf: bytes = b'fake-pdf', markdown: str = '# fake md') -> None:
18
- paper_dir = data_dir / 'papers' / encode_doi(doi)
19
- paper_dir.mkdir(parents=True, exist_ok=True)
20
- (paper_dir / 'source.pdf').write_bytes(pdf)
21
- (paper_dir / 'markdown.md').write_text(markdown)
10
+ import demo_gateway.main as demo_main
22
11
 
23
12
 
24
13
  def test_post_resolve_citations_rejects_malformed_body(client: TestClient) -> None:
@@ -26,8 +15,8 @@ def test_post_resolve_citations_rejects_malformed_body(client: TestClient) -> No
26
15
  assert response.status_code == 422
27
16
 
28
17
 
29
- def test_post_resolve_citations_returns_errors_for_missing_pdfs(client: TestClient) -> None:
30
- """When source.pdf is absent, the DOI surfaces in `errors` rather than `resolved`."""
18
+ def test_post_resolve_citations_returns_errors_for_missing_index(client: TestClient) -> None:
19
+ """When pdf_index.pkl.zst is absent, the DOI surfaces in `errors` rather than `resolved`."""
31
20
  response = client.post(
32
21
  '/resolve-citations',
33
22
  json={'citations': [{'doi': '10.1/missing', 'quotes': ['anything']}]},
@@ -35,15 +24,11 @@ def test_post_resolve_citations_returns_errors_for_missing_pdfs(client: TestClie
35
24
  assert response.status_code == 200
36
25
  body = response.json()
37
26
  assert body['resolved'] == {}
38
- assert body['errors'] == {'10.1/missing': 'source.pdf not found'}
27
+ assert body['errors'] == {'10.1/missing': 'pdf_index not available'}
39
28
 
40
29
 
41
- def test_post_resolve_citations_returns_resolved_bboxes(
42
- client: TestClient,
43
- settings: Settings,
44
- monkeypatch,
45
- ) -> None:
46
- """When source.pdf exists, the route resolves quotes to bboxes via the library."""
30
+ def test_post_resolve_citations_returns_resolved_bboxes(client: TestClient, monkeypatch) -> None:
31
+ """When the index loads, the route resolves quotes to bboxes via the library."""
47
32
 
48
33
  class _FakeBbox:
49
34
  def __init__(self, top: int, left: int, bottom: int, right: int) -> None:
@@ -53,16 +38,16 @@ def test_post_resolve_citations_returns_resolved_bboxes(
53
38
  self.right = right
54
39
 
55
40
  class _FakePdfIndex:
56
- def __init__(self, _pdf_bytes: bytes, *, markdown: str | None = None) -> None:
57
- pass
58
-
59
41
  def resolve(self, quotes: list[str]) -> dict[str, list[tuple[int, _FakeBbox]]]:
60
42
  # 0-indexed page from anchorite — the +1 boundary wrap in resolve.py
61
43
  # turns this into page=1 on the wire.
62
44
  return {q: [(0, _FakeBbox(top=10, left=20, bottom=30, right=40))] for q in quotes}
63
45
 
64
- monkeypatch.setattr(flowa_resolve_module, 'PdfIndex', _FakePdfIndex)
65
- _write_fake_paper(settings.demo_data_dir, '10.1/present')
46
+ monkeypatch.setattr(
47
+ demo_main,
48
+ 'load_pdf_index_from_storage',
49
+ lambda _base, doi: _FakePdfIndex() if doi == '10.1/present' else None,
50
+ )
66
51
 
67
52
  response = client.post(
68
53
  '/resolve-citations',
@@ -415,7 +415,7 @@ wheels = [
415
415
 
416
416
  [[package]]
417
417
  name = "flowapy"
418
- version = "0.1.3"
418
+ version = "0.2.0"
419
419
  source = { editable = "../../" }
420
420
  dependencies = [
421
421
  { name = "anchorite" },
@@ -431,6 +431,7 @@ dependencies = [
431
431
  { name = "s3fs" },
432
432
  { name = "tenacity" },
433
433
  { name = "typer" },
434
+ { name = "zstandard" },
434
435
  ]
435
436
 
436
437
  [package.optional-dependencies]
@@ -466,6 +467,7 @@ requires-dist = [
466
467
  { name = "s3fs" },
467
468
  { name = "tenacity" },
468
469
  { name = "typer" },
470
+ { name = "zstandard" },
469
471
  ]
470
472
  provides-extras = ["anthropic", "bedrock", "google", "openai"]
471
473
 
@@ -2109,3 +2111,28 @@ sdist = { url = "https://files.pythonhosted.org/packages/30/21/093488dfc7cc8964d
2109
2111
  wheels = [
2110
2112
  { url = "https://files.pythonhosted.org/packages/08/8a/0861bec20485572fbddf3dfba2910e38fe249796cb73ecdeb74e07eeb8d3/zipp-3.23.1-py3-none-any.whl", hash = "sha256:0b3596c50a5c700c9cb40ba8d86d9f2cc4807e9bedb06bcdf7fac85633e444dc", size = 10378, upload-time = "2026-04-13T23:21:45.386Z" },
2111
2113
  ]
2114
+
2115
+ [[package]]
2116
+ name = "zstandard"
2117
+ version = "0.25.0"
2118
+ source = { registry = "https://pypi.org/simple" }
2119
+ sdist = { url = "https://files.pythonhosted.org/packages/fd/aa/3e0508d5a5dd96529cdc5a97011299056e14c6505b678fd58938792794b1/zstandard-0.25.0.tar.gz", hash = "sha256:7713e1179d162cf5c7906da876ec2ccb9c3a9dcbdffef0cc7f70c3667a205f0b", size = 711513, upload-time = "2025-09-14T22:15:54.002Z" }
2120
+ wheels = [
2121
+ { url = "https://files.pythonhosted.org/packages/35/0b/8df9c4ad06af91d39e94fa96cc010a24ac4ef1378d3efab9223cc8593d40/zstandard-0.25.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:ec996f12524f88e151c339688c3897194821d7f03081ab35d31d1e12ec975e94", size = 795735, upload-time = "2025-09-14T22:17:26.042Z" },
2122
+ { url = "https://files.pythonhosted.org/packages/3f/06/9ae96a3e5dcfd119377ba33d4c42a7d89da1efabd5cb3e366b156c45ff4d/zstandard-0.25.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:a1a4ae2dec3993a32247995bdfe367fc3266da832d82f8438c8570f989753de1", size = 640440, upload-time = "2025-09-14T22:17:27.366Z" },
2123
+ { url = "https://files.pythonhosted.org/packages/d9/14/933d27204c2bd404229c69f445862454dcc101cd69ef8c6068f15aaec12c/zstandard-0.25.0-cp313-cp313-manylinux2010_i686.manylinux2014_i686.manylinux_2_12_i686.manylinux_2_17_i686.whl", hash = "sha256:e96594a5537722fdfb79951672a2a63aec5ebfb823e7560586f7484819f2a08f", size = 5343070, upload-time = "2025-09-14T22:17:28.896Z" },
2124
+ { url = "https://files.pythonhosted.org/packages/6d/db/ddb11011826ed7db9d0e485d13df79b58586bfdec56e5c84a928a9a78c1c/zstandard-0.25.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:bfc4e20784722098822e3eee42b8e576b379ed72cca4a7cb856ae733e62192ea", size = 5063001, upload-time = "2025-09-14T22:17:31.044Z" },
2125
+ { url = "https://files.pythonhosted.org/packages/db/00/87466ea3f99599d02a5238498b87bf84a6348290c19571051839ca943777/zstandard-0.25.0-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:457ed498fc58cdc12fc48f7950e02740d4f7ae9493dd4ab2168a47c93c31298e", size = 5394120, upload-time = "2025-09-14T22:17:32.711Z" },
2126
+ { url = "https://files.pythonhosted.org/packages/2b/95/fc5531d9c618a679a20ff6c29e2b3ef1d1f4ad66c5e161ae6ff847d102a9/zstandard-0.25.0-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:fd7a5004eb1980d3cefe26b2685bcb0b17989901a70a1040d1ac86f1d898c551", size = 5451230, upload-time = "2025-09-14T22:17:34.41Z" },
2127
+ { url = "https://files.pythonhosted.org/packages/63/4b/e3678b4e776db00f9f7b2fe58e547e8928ef32727d7a1ff01dea010f3f13/zstandard-0.25.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:8e735494da3db08694d26480f1493ad2cf86e99bdd53e8e9771b2752a5c0246a", size = 5547173, upload-time = "2025-09-14T22:17:36.084Z" },
2128
+ { url = "https://files.pythonhosted.org/packages/4e/d5/ba05ed95c6b8ec30bd468dfeab20589f2cf709b5c940483e31d991f2ca58/zstandard-0.25.0-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:3a39c94ad7866160a4a46d772e43311a743c316942037671beb264e395bdd611", size = 5046736, upload-time = "2025-09-14T22:17:37.891Z" },
2129
+ { url = "https://files.pythonhosted.org/packages/50/d5/870aa06b3a76c73eced65c044b92286a3c4e00554005ff51962deef28e28/zstandard-0.25.0-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:172de1f06947577d3a3005416977cce6168f2261284c02080e7ad0185faeced3", size = 5576368, upload-time = "2025-09-14T22:17:40.206Z" },
2130
+ { url = "https://files.pythonhosted.org/packages/5d/35/398dc2ffc89d304d59bc12f0fdd931b4ce455bddf7038a0a67733a25f550/zstandard-0.25.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:3c83b0188c852a47cd13ef3bf9209fb0a77fa5374958b8c53aaa699398c6bd7b", size = 4954022, upload-time = "2025-09-14T22:17:41.879Z" },
2131
+ { url = "https://files.pythonhosted.org/packages/9a/5c/36ba1e5507d56d2213202ec2b05e8541734af5f2ce378c5d1ceaf4d88dc4/zstandard-0.25.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:1673b7199bbe763365b81a4f3252b8e80f44c9e323fc42940dc8843bfeaf9851", size = 5267889, upload-time = "2025-09-14T22:17:43.577Z" },
2132
+ { url = "https://files.pythonhosted.org/packages/70/e8/2ec6b6fb7358b2ec0113ae202647ca7c0e9d15b61c005ae5225ad0995df5/zstandard-0.25.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:0be7622c37c183406f3dbf0cba104118eb16a4ea7359eeb5752f0794882fc250", size = 5433952, upload-time = "2025-09-14T22:17:45.271Z" },
2133
+ { url = "https://files.pythonhosted.org/packages/7b/01/b5f4d4dbc59ef193e870495c6f1275f5b2928e01ff5a81fecb22a06e22fb/zstandard-0.25.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:5f5e4c2a23ca271c218ac025bd7d635597048b366d6f31f420aaeb715239fc98", size = 5814054, upload-time = "2025-09-14T22:17:47.08Z" },
2134
+ { url = "https://files.pythonhosted.org/packages/b2/e5/fbd822d5c6f427cf158316d012c5a12f233473c2f9c5fe5ab1ae5d21f3d8/zstandard-0.25.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:4f187a0bb61b35119d1926aee039524d1f93aaf38a9916b8c4b78ac8514a0aaf", size = 5360113, upload-time = "2025-09-14T22:17:48.893Z" },
2135
+ { url = "https://files.pythonhosted.org/packages/8e/e0/69a553d2047f9a2c7347caa225bb3a63b6d7704ad74610cb7823baa08ed7/zstandard-0.25.0-cp313-cp313-win32.whl", hash = "sha256:7030defa83eef3e51ff26f0b7bfb229f0204b66fe18e04359ce3474ac33cbc09", size = 436936, upload-time = "2025-09-14T22:17:52.658Z" },
2136
+ { url = "https://files.pythonhosted.org/packages/d9/82/b9c06c870f3bd8767c201f1edbdf9e8dc34be5b0fbc5682c4f80fe948475/zstandard-0.25.0-cp313-cp313-win_amd64.whl", hash = "sha256:1f830a0dac88719af0ae43b8b2d6aef487d437036468ef3c2ea59c51f9d55fd5", size = 506232, upload-time = "2025-09-14T22:17:50.402Z" },
2137
+ { url = "https://files.pythonhosted.org/packages/d4/57/60c3c01243bb81d381c9916e2a6d9e149ab8627c0c7d7abb2d73384b3c0c/zstandard-0.25.0-cp313-cp313-win_arm64.whl", hash = "sha256:85304a43f4d513f5464ceb938aa02c1e78c2943b29f44a750b48b25ac999a049", size = 462671, upload-time = "2025-09-14T22:17:51.533Z" },
2138
+ ]
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@flowajs/chat-service",
3
- "version": "0.0.3",
3
+ "version": "0.0.4",
4
4
  "description": "Stateless service that orchestrates LLM conversations over flowa artifacts.",
5
5
  "license": "MIT",
6
6
  "type": "module",
@@ -71,7 +71,7 @@
71
71
  "zod": "4.4.3"
72
72
  },
73
73
  "peerDependencies": {
74
- "@ai-sdk/amazon-bedrock": "^4.0.0",
74
+ "@ai-sdk/amazon-bedrock": "^4.0.101",
75
75
  "@ai-sdk/anthropic": "^3.0.0",
76
76
  "@ai-sdk/google": "^3.0.0",
77
77
  "@ai-sdk/google-vertex": "^4.0.0",
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "flowapy"
3
- version = "0.1.3"
3
+ version = "0.2.0"
4
4
  description = "Variant literature assessment pipeline with AI extraction"
5
5
  readme = "README.md"
6
6
  requires-python = "==3.13.*"
@@ -27,6 +27,7 @@ dependencies = [
27
27
  "s3fs", # S3/MinIO support for fsspec
28
28
  "tenacity",
29
29
  "typer",
30
+ "zstandard",
30
31
  ]
31
32
 
32
33
  [project.optional-dependencies]
@@ -15,7 +15,7 @@ from pydantic_ai import Agent, ModelRetry, NativeOutput, RunContext
15
15
  from flowa.clinvar import format_clinvar_for_prompt, query_clinvar
16
16
  from flowa.models import create_model, get_model_settings
17
17
  from flowa.prompts import load_prompt_and_schema
18
- from flowa.resolve import CitationQuery, resolve_citations
18
+ from flowa.resolve import CitationQuery, load_pdf_index_from_storage, resolve_citations
19
19
  from flowa.schema import AGGREGATION_SCHEMA_VERSION, with_schema_version
20
20
  from flowa.settings import ModelConfig, Settings
21
21
  from flowa.storage import (
@@ -23,9 +23,7 @@ from flowa.storage import (
23
23
  encode_doi,
24
24
  exists,
25
25
  paper_url,
26
- read_bytes,
27
26
  read_json,
28
- read_text,
29
27
  write_bytes,
30
28
  write_json,
31
29
  )
@@ -158,16 +156,14 @@ def create_aggregate_agent(
158
156
  def resolve_aggregate_citations(
159
157
  aggregate_dict: dict[str, Any],
160
158
  paper_id_to_doi: dict[str, str],
161
- pdf_bytes_cache: dict[str, bytes],
162
- markdown_cache: dict[str, str],
159
+ base: str,
163
160
  metadata_cache: dict[str, dict[str, Any]],
164
161
  ) -> None:
165
162
  """Post-process aggregate output: resolve quotes to bboxes on claim citations.
166
163
 
167
- Delegates the actual alignment to `flowa.resolve.resolve_citations` with
168
- cache-backed loaders, then attaches the resulting bboxes onto each claim's
169
- citations in place. Claims are grouped by paper_id so every (paper_id,
170
- quote) pair resolves to exactly one paper.
164
+ Loads each paper's pre-built `pdf_index.pkl.zst` via the same path the
165
+ gateway uses; the convert step that ran earlier in this pipeline wrote
166
+ the artifact, so it's guaranteed to be present.
171
167
  """
172
168
  # Collect all (doi, quote) pairs, grouped by DOI.
173
169
  doi_quotes: dict[str, list[str]] = {}
@@ -180,8 +176,7 @@ def resolve_aggregate_citations(
180
176
  citations_input = [CitationQuery(doi=doi, quotes=quotes) for doi, quotes in doi_quotes.items()]
181
177
  result = resolve_citations(
182
178
  citations_input,
183
- pdf_loader=pdf_bytes_cache.get,
184
- markdown_loader=markdown_cache.get,
179
+ index_provider=lambda doi: load_pdf_index_from_storage(base, doi),
185
180
  )
186
181
 
187
182
  # Attach resolved bboxes onto each claim's citations.
@@ -229,14 +224,11 @@ async def aggregate_evidence_async(
229
224
  clinvar_data = query_clinvar(hgvs_c_full, ncbi_api_key)
230
225
  clinvar_text = format_clinvar_for_prompt(clinvar_data)
231
226
 
232
- # Load extractions and metadata for each paper. PDF bytes and Markdown are
233
- # cached for post-LLM citation resolution: PdfIndex takes both (markdown
234
- # denoises the indexed PDF chars). Both files are produced together by the
235
- # pipeline; a missing markdown.md at this point is a storage corruption and
236
- # surfaces as FileNotFoundError below.
227
+ # Load extractions and metadata for each paper. PDF bytes and markdown
228
+ # are NOT loaded here — the post-LLM citation resolver loads the paper's
229
+ # pre-built `pdf_index.pkl.zst` directly from storage, so this step only
230
+ # needs the LLM inputs.
237
231
  evidence_extractions: list[dict[str, Any]] = []
238
- pdf_bytes_cache: dict[str, bytes] = {}
239
- markdown_cache: dict[str, str] = {}
240
232
  metadata_cache: dict[str, dict[str, Any]] = {}
241
233
 
242
234
  for doi in dois:
@@ -252,8 +244,6 @@ async def aggregate_evidence_async(
252
244
  log.info('Skipping %s: variant not discussed', doi)
253
245
  continue
254
246
 
255
- pdf_bytes_cache[doi] = read_bytes(paper_url(base, doi, 'source.pdf'))
256
- markdown_cache[doi] = read_text(paper_url(base, doi, 'markdown.md'))
257
247
  metadata = read_json(paper_url(base, doi, 'metadata.json'))
258
248
  metadata_cache[doi] = metadata
259
249
 
@@ -332,7 +322,7 @@ async def aggregate_evidence_async(
332
322
  # Post-LLM: resolve quotes to bboxes, replace paper_id with DOI
333
323
  aggregate_dict = output.model_dump()
334
324
  with logfire.span('flowa.resolve_citations', paper_count=len(paper_id_to_doi)):
335
- resolve_aggregate_citations(aggregate_dict, paper_id_to_doi, pdf_bytes_cache, markdown_cache, metadata_cache)
325
+ resolve_aggregate_citations(aggregate_dict, paper_id_to_doi, base, metadata_cache)
336
326
 
337
327
  # Store structured aggregation result
338
328
  write_json(aggregation_url, with_schema_version(aggregate_dict, AGGREGATION_SCHEMA_VERSION))
@@ -17,9 +17,11 @@ from pydantic_ai import Agent
17
17
  from pydantic_ai.messages import BinaryContent
18
18
 
19
19
  from flowa.models import create_model, get_model_settings
20
+ from flowa.pdf_index_cache import build as build_pdf_index_payload
21
+ from flowa.pdf_index_cache import serialize as serialize_pdf_index_payload
20
22
  from flowa.prompts import load_text_prompt
21
23
  from flowa.settings import ModelConfig, Settings
22
- from flowa.storage import exists, paper_url, read_bytes, write_bytes, write_text
24
+ from flowa.storage import exists, paper_url, read_bytes, read_text, write_bytes, write_text
23
25
 
24
26
  log = logging.getLogger(__name__)
25
27
 
@@ -120,14 +122,22 @@ async def transcribe(
120
122
 
121
123
 
122
124
  async def convert_paper_async(base: str, doi: str, model: ModelConfig, prompt_set: str = 'generic') -> None:
123
- """Convert a single paper's PDF to Markdown.
125
+ """Convert a single paper's PDF to Markdown and persist its `PdfIndex`.
124
126
 
125
- Reads PDF from papers/{encoded_doi}/source.pdf in object storage.
126
- Stores result to papers/{encoded_doi}/markdown.md.
127
+ Reads PDF from papers/{encoded_doi}/source.pdf and writes
128
+ papers/{encoded_doi}/markdown.md plus papers/{encoded_doi}/pdf_index.pkl.zst
129
+ (consumed by the gateway's resolve endpoint).
130
+
131
+ Either artifact can be missing independently — if a previous run failed
132
+ or pre-dates the pdf_index step, the next call fills in only what's
133
+ missing without re-transcribing or re-building work that's already done.
127
134
  """
128
135
  md_url = paper_url(base, doi, 'markdown.md')
136
+ index_url = paper_url(base, doi, 'pdf_index.pkl.zst')
137
+ md_needed = not exists(md_url)
138
+ index_needed = not exists(index_url)
129
139
 
130
- if exists(md_url):
140
+ if not md_needed and not index_needed:
131
141
  log.info('Already converted: %s', md_url)
132
142
  return
133
143
 
@@ -138,21 +148,36 @@ async def convert_paper_async(base: str, doi: str, model: ModelConfig, prompt_se
138
148
  log.info('Skipping DOI %s: PDF not available', doi)
139
149
  return
140
150
 
141
- log.info(
142
- 'Converting DOI %s (%d bytes, model: %s, chunk: %d pages)', doi, len(pdf_bytes), model.name, PAGES_PER_CHUNK
143
- )
144
-
145
- prompt = load_text_prompt('transcription', prompt_set)
146
- t0 = time.monotonic()
147
- result = await transcribe(pdf_bytes, model=model, prompt=prompt, page_count=PAGES_PER_CHUNK)
148
- elapsed = time.monotonic() - t0
149
-
150
- write_text(md_url, result.markdown)
151
-
152
- raw_url = paper_url(base, doi, 'convert_raw.json')
153
- write_bytes(raw_url, json.dumps(result.all_messages).encode())
154
-
155
- log.info('Converted DOI %s: %d chars in %.1fs', doi, len(result.markdown), elapsed)
151
+ markdown: str | None = None
152
+ if md_needed:
153
+ log.info(
154
+ 'Converting DOI %s (%d bytes, model: %s, chunk: %d pages)', doi, len(pdf_bytes), model.name, PAGES_PER_CHUNK
155
+ )
156
+ prompt = load_text_prompt('transcription', prompt_set)
157
+ t0 = time.monotonic()
158
+ result = await transcribe(pdf_bytes, model=model, prompt=prompt, page_count=PAGES_PER_CHUNK)
159
+ elapsed = time.monotonic() - t0
160
+
161
+ markdown = result.markdown
162
+ write_text(md_url, markdown)
163
+ write_bytes(paper_url(base, doi, 'convert_raw.json'), json.dumps(result.all_messages).encode())
164
+ log.info('Converted DOI %s: %d chars in %.1fs', doi, len(markdown), elapsed)
165
+
166
+ if index_needed:
167
+ # PdfIndex construction is CPU-bound (~8s on the deployed gateway
168
+ # hardware) and dominates per-call latency at `/api/v1/resolve` if
169
+ # rebuilt on every call. Pay the cost here once per paper and ship
170
+ # the result; see `flowa.pdf_index_cache` for the storage format.
171
+ # `asyncio.to_thread` keeps the rest of the convert pipeline (other
172
+ # papers being transcribed concurrently) unblocked.
173
+ if markdown is None: # index missing but markdown already on disk
174
+ markdown = read_text(md_url)
175
+ t0 = time.monotonic()
176
+ blob = await asyncio.to_thread(
177
+ lambda: serialize_pdf_index_payload(build_pdf_index_payload(pdf_bytes, markdown))
178
+ )
179
+ write_bytes(index_url, blob)
180
+ log.info('Wrote pdf_index for DOI %s: %.1f MB in %.1fs', doi, len(blob) / 1e6, time.monotonic() - t0)
156
181
 
157
182
 
158
183
  def convert_paper(
@@ -0,0 +1,133 @@
1
+ """Build, serialise, and load cached `PdfIndex` artifacts.
2
+
3
+ The gateway's per-call cost is dominated by `PdfIndex(pdf_bytes)` construction
4
+ — ~8s on the deployed gateway hardware for a typical paper, against ~300ms
5
+ for the actual quote alignment afterwards. To avoid paying that on every
6
+ `/api/v1/resolve` call, the pipeline persists the constructed index at
7
+ `papers/{encoded_doi}/pdf_index.pkl.zst` so the gateway can load instead of
8
+ rebuild.
9
+
10
+ Persisted (at-rest) format: zstd-compressed pickle of a single dict:
11
+
12
+ {
13
+ "format_version": int, # bumped when this module changes what it serialises
14
+ "source_pdf_sha256": str, # sha256 hex digest of source.pdf bytes
15
+ "pdf_index": PdfIndex,
16
+ }
17
+
18
+ The header fields let `deserialize` reject artifacts that are out of sync
19
+ with the current source.pdf or with the current cache format — a stale
20
+ pickle would silently produce wrong bboxes. The format version is *our*
21
+ contract over the persisted shape: bump it when this module changes which
22
+ fields it stores or how it stores them. It is not tied to anchorite's
23
+ release cadence — anchorite patch/minor releases that preserve the pickle
24
+ shape of `PdfIndex` deserialise fine.
25
+
26
+ zstd level 3 was chosen empirically (see specs/supplements.md): ~5x faster
27
+ compression than gzip -6 at comparable ratio (~1/5 of the pickled size),
28
+ and a touch faster decompression. The pipeline pays the compress cost once
29
+ per paper; the gateway pays the decompress cost on every cold load, so
30
+ optimising for both directions matters.
31
+
32
+ Anchorite documents that `PdfIndex` pickles cleanly (state is str/bytes/
33
+ list[int]/frozen dataclasses); no custom reducers needed.
34
+ """
35
+
36
+ from __future__ import annotations
37
+
38
+ import hashlib
39
+ import pickle
40
+ from dataclasses import dataclass
41
+
42
+ import zstandard
43
+ from anchorite import PdfIndex
44
+
45
+ # Bump when the persisted shape changes: new fields, removed fields, semantic
46
+ # meaning changes. Anchorite version bumps that preserve `PdfIndex` pickle
47
+ # compatibility do NOT require a bump here — those deserialise correctly
48
+ # under the existing format. If a new anchorite release changes `PdfIndex`'s
49
+ # internals such that old pickles still load but produce different bboxes,
50
+ # bump this to force a re-backfill.
51
+ FORMAT_VERSION = 1
52
+
53
+ ZSTD_LEVEL = 3
54
+ PICKLE_PROTOCOL = pickle.HIGHEST_PROTOCOL
55
+
56
+
57
+ class StaleIndexError(Exception):
58
+ """Persisted artifact's header doesn't match the runtime expectation.
59
+
60
+ Raised by `deserialize` when the pickle was written under a different
61
+ `FORMAT_VERSION` or against a source.pdf with a different sha256. The
62
+ caller decides whether to rebuild or surface an error — this module
63
+ deliberately doesn't fall back, because silent rebuild masks pipeline
64
+ drift.
65
+ """
66
+
67
+
68
+ @dataclass(frozen=True)
69
+ class PdfIndexPayload:
70
+ """In-memory view of the persisted artifact."""
71
+
72
+ format_version: int
73
+ source_pdf_sha256: str
74
+ pdf_index: PdfIndex
75
+
76
+
77
+ def build(pdf_bytes: bytes, markdown: str) -> PdfIndexPayload:
78
+ """Construct a PdfIndex pinned to its source PDF by sha256.
79
+
80
+ `markdown` is the paper's transcription. It is accepted for anchorite's
81
+ markdown denoise (dropping running heads, page numbers, footnote markers)
82
+ but is NOT forwarded to `PdfIndex` yet; see the comment in the body.
83
+ """
84
+ # markdown is threaded through but not yet forwarded to PdfIndex: the
85
+ # anchorite-#19 markdown denoise drops entire pages of atoms when the
86
+ # markdown reorders content relative to PDF page order. Switch to
87
+ # `markdown=markdown` once the upstream fix lands — and bump
88
+ # FORMAT_VERSION + re-backfill, since the denoised index resolves quotes
89
+ # against a different cached char string (existing pickles would silently
90
+ # produce different bboxes).
91
+ return PdfIndexPayload(
92
+ format_version=FORMAT_VERSION,
93
+ source_pdf_sha256=hashlib.sha256(pdf_bytes).hexdigest(),
94
+ pdf_index=PdfIndex(pdf_bytes, markdown=None),
95
+ )
96
+
97
+
98
+ def serialize(payload: PdfIndexPayload) -> bytes:
99
+ """Pickle + zstd-compress for upload to S3."""
100
+ pkl = pickle.dumps(
101
+ {
102
+ 'format_version': payload.format_version,
103
+ 'source_pdf_sha256': payload.source_pdf_sha256,
104
+ 'pdf_index': payload.pdf_index,
105
+ },
106
+ protocol=PICKLE_PROTOCOL,
107
+ )
108
+ return zstandard.ZstdCompressor(level=ZSTD_LEVEL).compress(pkl)
109
+
110
+
111
+ def deserialize(blob: bytes, *, expected_pdf_sha256: str | None = None) -> PdfIndexPayload:
112
+ """Decompress + unpickle. Verifies the header before returning.
113
+
114
+ The runtime `FORMAT_VERSION` is always checked. The `source_pdf_sha256`
115
+ check is only performed when the caller supplies a value to compare
116
+ against — the gateway typically skips it (would require fetching source.pdf
117
+ just to hash it). Callers that already have the source bytes pass the
118
+ digest in to catch pipeline drift.
119
+ """
120
+ pkl = zstandard.ZstdDecompressor().decompress(blob)
121
+ raw = pickle.loads(pkl)
122
+ payload = PdfIndexPayload(
123
+ format_version=raw['format_version'],
124
+ source_pdf_sha256=raw['source_pdf_sha256'],
125
+ pdf_index=raw['pdf_index'],
126
+ )
127
+ if payload.format_version != FORMAT_VERSION:
128
+ raise StaleIndexError(f'format version mismatch: pickle={payload.format_version!r} runtime={FORMAT_VERSION!r}')
129
+ if expected_pdf_sha256 is not None and payload.source_pdf_sha256 != expected_pdf_sha256:
130
+ raise StaleIndexError(
131
+ f'source.pdf hash mismatch: pickle={payload.source_pdf_sha256!r} actual={expected_pdf_sha256!r}'
132
+ )
133
+ return payload