@mantra-ai/core 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/ai/google/client.d.ts +67 -0
- package/dist/ai/google/client.d.ts.map +1 -0
- package/dist/ai/google/client.js +169 -0
- package/dist/ai/google/client.js.map +1 -0
- package/dist/ai/google/generate.d.ts +10 -0
- package/dist/ai/google/generate.d.ts.map +1 -0
- package/dist/ai/google/generate.js +137 -0
- package/dist/ai/google/generate.js.map +1 -0
- package/dist/ai/google/index.d.ts +4 -0
- package/dist/ai/google/index.d.ts.map +1 -0
- package/dist/ai/google/index.js +4 -0
- package/dist/ai/google/index.js.map +1 -0
- package/dist/ai/google/types.d.ts +88 -0
- package/dist/ai/google/types.d.ts.map +1 -0
- package/dist/ai/google/types.js +55 -0
- package/dist/ai/google/types.js.map +1 -0
- package/dist/ai/index.d.ts +3 -0
- package/dist/ai/index.d.ts.map +1 -0
- package/dist/ai/index.js +3 -0
- package/dist/ai/index.js.map +1 -0
- package/dist/ai/openai/client.d.ts +22 -0
- package/dist/ai/openai/client.d.ts.map +1 -0
- package/dist/ai/openai/client.js +49 -0
- package/dist/ai/openai/client.js.map +1 -0
- package/dist/ai/openai/generate.d.ts +14 -0
- package/dist/ai/openai/generate.d.ts.map +1 -0
- package/dist/ai/openai/generate.js +178 -0
- package/dist/ai/openai/generate.js.map +1 -0
- package/dist/ai/openai/index.d.ts +4 -0
- package/dist/ai/openai/index.d.ts.map +1 -0
- package/dist/ai/openai/index.js +4 -0
- package/dist/ai/openai/index.js.map +1 -0
- package/dist/ai/openai/types.d.ts +86 -0
- package/dist/ai/openai/types.d.ts.map +1 -0
- package/dist/ai/openai/types.js +56 -0
- package/dist/ai/openai/types.js.map +1 -0
- package/dist/ai/prompts/index.d.ts +1 -0
- package/dist/ai/prompts/index.d.ts.map +1 -0
- package/dist/ai/prompts/index.js +2 -0
- package/dist/ai/prompts/index.js.map +1 -0
- package/dist/errors/index.d.ts +3 -0
- package/dist/errors/index.d.ts.map +1 -0
- package/dist/errors/index.js +4 -0
- package/dist/errors/index.js.map +1 -0
- package/dist/errors/schemas.d.ts +304 -0
- package/dist/errors/schemas.d.ts.map +1 -0
- package/dist/errors/schemas.js +57 -0
- package/dist/errors/schemas.js.map +1 -0
- package/dist/errors/types.d.ts +30 -0
- package/dist/errors/types.d.ts.map +1 -0
- package/dist/errors/types.js +33 -0
- package/dist/errors/types.js.map +1 -0
- package/dist/index.d.ts +21 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +21 -0
- package/dist/index.js.map +1 -0
- package/dist/normalization/jats/index.d.ts +4 -0
- package/dist/normalization/jats/index.d.ts.map +1 -0
- package/dist/normalization/jats/index.js +3 -0
- package/dist/normalization/jats/index.js.map +1 -0
- package/dist/normalization/jats/normalize.d.ts +7 -0
- package/dist/normalization/jats/normalize.d.ts.map +1 -0
- package/dist/normalization/jats/normalize.js +213 -0
- package/dist/normalization/jats/normalize.js.map +1 -0
- package/dist/normalization/jats/utils/build/finalize.d.ts +3 -0
- package/dist/normalization/jats/utils/build/finalize.d.ts.map +1 -0
- package/dist/normalization/jats/utils/build/finalize.js +462 -0
- package/dist/normalization/jats/utils/build/finalize.js.map +1 -0
- package/dist/normalization/jats/utils/build/flatten.d.ts +20 -0
- package/dist/normalization/jats/utils/build/flatten.d.ts.map +1 -0
- package/dist/normalization/jats/utils/build/flatten.js +502 -0
- package/dist/normalization/jats/utils/build/flatten.js.map +1 -0
- package/dist/normalization/jats/utils/build/meta.d.ts +10 -0
- package/dist/normalization/jats/utils/build/meta.d.ts.map +1 -0
- package/dist/normalization/jats/utils/build/meta.js +32 -0
- package/dist/normalization/jats/utils/build/meta.js.map +1 -0
- package/dist/normalization/jats/utils/build/version.d.ts +3 -0
- package/dist/normalization/jats/utils/build/version.d.ts.map +1 -0
- package/dist/normalization/jats/utils/build/version.js +11 -0
- package/dist/normalization/jats/utils/build/version.js.map +1 -0
- package/dist/normalization/jats/utils/category.d.ts +11 -0
- package/dist/normalization/jats/utils/category.d.ts.map +1 -0
- package/dist/normalization/jats/utils/category.js +431 -0
- package/dist/normalization/jats/utils/category.js.map +1 -0
- package/dist/normalization/jats/utils/collectors/abstracts.d.ts +3 -0
- package/dist/normalization/jats/utils/collectors/abstracts.d.ts.map +1 -0
- package/dist/normalization/jats/utils/collectors/abstracts.js +168 -0
- package/dist/normalization/jats/utils/collectors/abstracts.js.map +1 -0
- package/dist/normalization/jats/utils/collectors/back.d.ts +35 -0
- package/dist/normalization/jats/utils/collectors/back.d.ts.map +1 -0
- package/dist/normalization/jats/utils/collectors/back.js +801 -0
- package/dist/normalization/jats/utils/collectors/back.js.map +1 -0
- package/dist/normalization/jats/utils/collectors/contributors.d.ts +4 -0
- package/dist/normalization/jats/utils/collectors/contributors.d.ts.map +1 -0
- package/dist/normalization/jats/utils/collectors/contributors.js +77 -0
- package/dist/normalization/jats/utils/collectors/contributors.js.map +1 -0
- package/dist/normalization/jats/utils/collectors/keywords.d.ts +2 -0
- package/dist/normalization/jats/utils/collectors/keywords.d.ts.map +1 -0
- package/dist/normalization/jats/utils/collectors/keywords.js +14 -0
- package/dist/normalization/jats/utils/collectors/keywords.js.map +1 -0
- package/dist/normalization/jats/utils/collectors/meta.d.ts +6 -0
- package/dist/normalization/jats/utils/collectors/meta.d.ts.map +1 -0
- package/dist/normalization/jats/utils/collectors/meta.js +103 -0
- package/dist/normalization/jats/utils/collectors/meta.js.map +1 -0
- package/dist/normalization/jats/utils/collectors/sections.d.ts +7 -0
- package/dist/normalization/jats/utils/collectors/sections.d.ts.map +1 -0
- package/dist/normalization/jats/utils/collectors/sections.js +484 -0
- package/dist/normalization/jats/utils/collectors/sections.js.map +1 -0
- package/dist/normalization/jats/utils/licenses.d.ts +5 -0
- package/dist/normalization/jats/utils/licenses.d.ts.map +1 -0
- package/dist/normalization/jats/utils/licenses.js +64 -0
- package/dist/normalization/jats/utils/licenses.js.map +1 -0
- package/dist/normalization/jats/utils/po/nodes.d.ts +6 -0
- package/dist/normalization/jats/utils/po/nodes.d.ts.map +1 -0
- package/dist/normalization/jats/utils/po/nodes.js +60 -0
- package/dist/normalization/jats/utils/po/nodes.js.map +1 -0
- package/dist/normalization/jats/utils/po/query.d.ts +7 -0
- package/dist/normalization/jats/utils/po/query.d.ts.map +1 -0
- package/dist/normalization/jats/utils/po/query.js +67 -0
- package/dist/normalization/jats/utils/po/query.js.map +1 -0
- package/dist/normalization/jats/utils/po/serialize.d.ts +4 -0
- package/dist/normalization/jats/utils/po/serialize.d.ts.map +1 -0
- package/dist/normalization/jats/utils/po/serialize.js +329 -0
- package/dist/normalization/jats/utils/po/serialize.js.map +1 -0
- package/dist/normalization/jats/utils/po/text.d.ts +7 -0
- package/dist/normalization/jats/utils/po/text.d.ts.map +1 -0
- package/dist/normalization/jats/utils/po/text.js +114 -0
- package/dist/normalization/jats/utils/po/text.js.map +1 -0
- package/dist/normalization/jats/utils/references.d.ts +26 -0
- package/dist/normalization/jats/utils/references.d.ts.map +1 -0
- package/dist/normalization/jats/utils/references.js +371 -0
- package/dist/normalization/jats/utils/references.js.map +1 -0
- package/dist/normalization/jats/utils/strings.d.ts +8 -0
- package/dist/normalization/jats/utils/strings.d.ts.map +1 -0
- package/dist/normalization/jats/utils/strings.js +197 -0
- package/dist/normalization/jats/utils/strings.js.map +1 -0
- package/dist/normalization/jats/utils/types.d.ts +233 -0
- package/dist/normalization/jats/utils/types.d.ts.map +1 -0
- package/dist/normalization/jats/utils/types.js +2 -0
- package/dist/normalization/jats/utils/types.js.map +1 -0
- package/dist/normalization/jats/utils/xml.d.ts +5 -0
- package/dist/normalization/jats/utils/xml.d.ts.map +1 -0
- package/dist/normalization/jats/utils/xml.js +69 -0
- package/dist/normalization/jats/utils/xml.js.map +1 -0
- package/dist/normalization/normalized-doc-schema.d.ts +1094 -0
- package/dist/normalization/normalized-doc-schema.d.ts.map +1 -0
- package/dist/normalization/normalized-doc-schema.js +410 -0
- package/dist/normalization/normalized-doc-schema.js.map +1 -0
- package/dist/normalization/pdf/index.d.ts +4 -0
- package/dist/normalization/pdf/index.d.ts.map +1 -0
- package/dist/normalization/pdf/index.js +3 -0
- package/dist/normalization/pdf/index.js.map +1 -0
- package/dist/normalization/pdf/normalize.d.ts +31 -0
- package/dist/normalization/pdf/normalize.d.ts.map +1 -0
- package/dist/normalization/pdf/normalize.js +321 -0
- package/dist/normalization/pdf/normalize.js.map +1 -0
- package/dist/normalization/pdf/prompt.d.ts +3 -0
- package/dist/normalization/pdf/prompt.d.ts.map +1 -0
- package/dist/normalization/pdf/prompt.js +118 -0
- package/dist/normalization/pdf/prompt.js.map +1 -0
- package/dist/sources/arxiv/client.d.ts +4 -0
- package/dist/sources/arxiv/client.d.ts.map +1 -0
- package/dist/sources/arxiv/client.js +13 -0
- package/dist/sources/arxiv/client.js.map +1 -0
- package/dist/sources/biorxiv/client.d.ts +21 -0
- package/dist/sources/biorxiv/client.d.ts.map +1 -0
- package/dist/sources/biorxiv/client.js +173 -0
- package/dist/sources/biorxiv/client.js.map +1 -0
- package/dist/sources/crossref/client.d.ts +3 -0
- package/dist/sources/crossref/client.d.ts.map +1 -0
- package/dist/sources/crossref/client.js +24 -0
- package/dist/sources/crossref/client.js.map +1 -0
- package/dist/sources/europepmc/client.d.ts +3 -0
- package/dist/sources/europepmc/client.d.ts.map +1 -0
- package/dist/sources/europepmc/client.js +29 -0
- package/dist/sources/europepmc/client.js.map +1 -0
- package/dist/sources/medrxiv/browser.d.ts +16 -0
- package/dist/sources/medrxiv/browser.d.ts.map +1 -0
- package/dist/sources/medrxiv/browser.js +210 -0
- package/dist/sources/medrxiv/browser.js.map +1 -0
- package/dist/sources/medrxiv/client.d.ts +34 -0
- package/dist/sources/medrxiv/client.d.ts.map +1 -0
- package/dist/sources/medrxiv/client.js +673 -0
- package/dist/sources/medrxiv/client.js.map +1 -0
- package/dist/sources/medrxiv/shared.d.ts +7 -0
- package/dist/sources/medrxiv/shared.d.ts.map +1 -0
- package/dist/sources/medrxiv/shared.js +18 -0
- package/dist/sources/medrxiv/shared.js.map +1 -0
- package/dist/sources/plos/client.d.ts +13 -0
- package/dist/sources/plos/client.d.ts.map +1 -0
- package/dist/sources/plos/client.js +147 -0
- package/dist/sources/plos/client.js.map +1 -0
- package/dist/sources/preprint-discovery.d.ts +55 -0
- package/dist/sources/preprint-discovery.d.ts.map +1 -0
- package/dist/sources/preprint-discovery.js +115 -0
- package/dist/sources/preprint-discovery.js.map +1 -0
- package/dist/types/expand.d.ts +5 -0
- package/dist/types/expand.d.ts.map +1 -0
- package/dist/types/expand.js +20 -0
- package/dist/types/expand.js.map +1 -0
- package/dist/types/methods-types.d.ts +37 -0
- package/dist/types/methods-types.d.ts.map +1 -0
- package/dist/types/methods-types.js +2 -0
- package/dist/types/methods-types.js.map +1 -0
- package/dist/types/multi-input-types.d.ts +57 -0
- package/dist/types/multi-input-types.d.ts.map +1 -0
- package/dist/types/multi-input-types.js +2 -0
- package/dist/types/multi-input-types.js.map +1 -0
- package/dist/types/paper/types.d.ts +41 -0
- package/dist/types/paper/types.d.ts.map +1 -0
- package/dist/types/paper/types.js +2 -0
- package/dist/types/paper/types.js.map +1 -0
- package/dist/types/results-types.d.ts +122 -0
- package/dist/types/results-types.d.ts.map +1 -0
- package/dist/types/results-types.js +17 -0
- package/dist/types/results-types.js.map +1 -0
- package/dist/types/supp-types.d.ts +6 -0
- package/dist/types/supp-types.d.ts.map +1 -0
- package/dist/types/supp-types.js +2 -0
- package/dist/types/supp-types.js.map +1 -0
- package/dist/types/version.d.ts +1828 -0
- package/dist/types/version.d.ts.map +1 -0
- package/dist/types/version.js +311 -0
- package/dist/types/version.js.map +1 -0
- package/dist/types/work.d.ts +4455 -0
- package/dist/types/work.d.ts.map +1 -0
- package/dist/types/work.js +330 -0
- package/dist/types/work.js.map +1 -0
- package/dist/works/adapters/crossref.d.ts +28 -0
- package/dist/works/adapters/crossref.d.ts.map +1 -0
- package/dist/works/adapters/crossref.js +43 -0
- package/dist/works/adapters/crossref.js.map +1 -0
- package/dist/works/adapters/europepmc.d.ts +14 -0
- package/dist/works/adapters/europepmc.d.ts.map +1 -0
- package/dist/works/adapters/europepmc.js +46 -0
- package/dist/works/adapters/europepmc.js.map +1 -0
- package/dist/works/adapters/openalex.d.ts +5 -0
- package/dist/works/adapters/openalex.d.ts.map +1 -0
- package/dist/works/adapters/openalex.js +75 -0
- package/dist/works/adapters/openalex.js.map +1 -0
- package/dist/works/errors.d.ts +23 -0
- package/dist/works/errors.d.ts.map +1 -0
- package/dist/works/errors.js +37 -0
- package/dist/works/errors.js.map +1 -0
- package/dist/works/id/detect-identifier.d.ts +15 -0
- package/dist/works/id/detect-identifier.d.ts.map +1 -0
- package/dist/works/id/detect-identifier.js +50 -0
- package/dist/works/id/detect-identifier.js.map +1 -0
- package/dist/works/id/normalize-external-id.d.ts +3 -0
- package/dist/works/id/normalize-external-id.d.ts.map +1 -0
- package/dist/works/id/normalize-external-id.js +44 -0
- package/dist/works/id/normalize-external-id.js.map +1 -0
- package/dist/works/id/normalize-ids.d.ts +66 -0
- package/dist/works/id/normalize-ids.d.ts.map +1 -0
- package/dist/works/id/normalize-ids.js +112 -0
- package/dist/works/id/normalize-ids.js.map +1 -0
- package/dist/works/id/normalize-internals.d.ts +7 -0
- package/dist/works/id/normalize-internals.d.ts.map +1 -0
- package/dist/works/id/normalize-internals.js +65 -0
- package/dist/works/id/normalize-internals.js.map +1 -0
- package/dist/works/id/resolve.d.ts +31 -0
- package/dist/works/id/resolve.d.ts.map +1 -0
- package/dist/works/id/resolve.js +123 -0
- package/dist/works/id/resolve.js.map +1 -0
- package/dist/works/id/resolveIds/assign.d.ts +4 -0
- package/dist/works/id/resolveIds/assign.d.ts.map +1 -0
- package/dist/works/id/resolveIds/assign.js +15 -0
- package/dist/works/id/resolveIds/assign.js.map +1 -0
- package/dist/works/id/resolveIds/flags.d.ts +11 -0
- package/dist/works/id/resolveIds/flags.d.ts.map +1 -0
- package/dist/works/id/resolveIds/flags.js +27 -0
- package/dist/works/id/resolveIds/flags.js.map +1 -0
- package/dist/works/id/resolveIds/idctx.d.ts +4 -0
- package/dist/works/id/resolveIds/idctx.d.ts.map +1 -0
- package/dist/works/id/resolveIds/idctx.js +25 -0
- package/dist/works/id/resolveIds/idctx.js.map +1 -0
- package/dist/works/id/resolveIds/index.d.ts +13 -0
- package/dist/works/id/resolveIds/index.d.ts.map +1 -0
- package/dist/works/id/resolveIds/index.js +498 -0
- package/dist/works/id/resolveIds/index.js.map +1 -0
- package/dist/works/id/resolveIds/versioning.d.ts +27 -0
- package/dist/works/id/resolveIds/versioning.d.ts.map +1 -0
- package/dist/works/id/resolveIds/versioning.js +156 -0
- package/dist/works/id/resolveIds/versioning.js.map +1 -0
- package/dist/works/id/resolveIds/workWhere.d.ts +3 -0
- package/dist/works/id/resolveIds/workWhere.d.ts.map +1 -0
- package/dist/works/id/resolveIds/workWhere.js +35 -0
- package/dist/works/id/resolveIds/workWhere.js.map +1 -0
- package/dist/works/id/types.d.ts +6 -0
- package/dist/works/id/types.d.ts.map +1 -0
- package/dist/works/id/types.js +2 -0
- package/dist/works/id/types.js.map +1 -0
- package/dist/works/pdf-fallback/candidates.d.ts +12 -0
- package/dist/works/pdf-fallback/candidates.d.ts.map +1 -0
- package/dist/works/pdf-fallback/candidates.js +51 -0
- package/dist/works/pdf-fallback/candidates.js.map +1 -0
- package/dist/works/pdf-fallback/fetch.d.ts +21 -0
- package/dist/works/pdf-fallback/fetch.d.ts.map +1 -0
- package/dist/works/pdf-fallback/fetch.js +89 -0
- package/dist/works/pdf-fallback/fetch.js.map +1 -0
- package/dist/works/pdf-fallback/index.d.ts +28 -0
- package/dist/works/pdf-fallback/index.d.ts.map +1 -0
- package/dist/works/pdf-fallback/index.js +35 -0
- package/dist/works/pdf-fallback/index.js.map +1 -0
- package/dist/works/plan.d.ts +8 -0
- package/dist/works/plan.d.ts.map +1 -0
- package/dist/works/plan.js +62 -0
- package/dist/works/plan.js.map +1 -0
- package/dist/works/strategies/arxiv.d.ts +3 -0
- package/dist/works/strategies/arxiv.d.ts.map +1 -0
- package/dist/works/strategies/arxiv.js +56 -0
- package/dist/works/strategies/arxiv.js.map +1 -0
- package/dist/works/strategies/biorxiv.d.ts +3 -0
- package/dist/works/strategies/biorxiv.d.ts.map +1 -0
- package/dist/works/strategies/biorxiv.js +63 -0
- package/dist/works/strategies/biorxiv.js.map +1 -0
- package/dist/works/strategies/europepmc.d.ts +3 -0
- package/dist/works/strategies/europepmc.d.ts.map +1 -0
- package/dist/works/strategies/europepmc.js +15 -0
- package/dist/works/strategies/europepmc.js.map +1 -0
- package/dist/works/strategies/index.d.ts +12 -0
- package/dist/works/strategies/index.d.ts.map +1 -0
- package/dist/works/strategies/index.js +19 -0
- package/dist/works/strategies/index.js.map +1 -0
- package/dist/works/strategies/landing-url.d.ts +3 -0
- package/dist/works/strategies/landing-url.d.ts.map +1 -0
- package/dist/works/strategies/landing-url.js +10 -0
- package/dist/works/strategies/landing-url.js.map +1 -0
- package/dist/works/strategies/medrxiv.d.ts +3 -0
- package/dist/works/strategies/medrxiv.d.ts.map +1 -0
- package/dist/works/strategies/medrxiv.js +47 -0
- package/dist/works/strategies/medrxiv.js.map +1 -0
- package/dist/works/strategies/plos.d.ts +3 -0
- package/dist/works/strategies/plos.d.ts.map +1 -0
- package/dist/works/strategies/plos.js +15 -0
- package/dist/works/strategies/plos.js.map +1 -0
- package/dist/works/strategies/shared.d.ts +11 -0
- package/dist/works/strategies/shared.d.ts.map +1 -0
- package/dist/works/strategies/shared.js +97 -0
- package/dist/works/strategies/shared.js.map +1 -0
- package/dist/works/strategies/ten1101.d.ts +3 -0
- package/dist/works/strategies/ten1101.d.ts.map +1 -0
- package/dist/works/strategies/ten1101.js +84 -0
- package/dist/works/strategies/ten1101.js.map +1 -0
- package/dist/works/text/acquire-fulltext.d.ts +7 -0
- package/dist/works/text/acquire-fulltext.d.ts.map +1 -0
- package/dist/works/text/acquire-fulltext.js +62 -0
- package/dist/works/text/acquire-fulltext.js.map +1 -0
- package/dist/works/text/normalize.d.ts +40 -0
- package/dist/works/text/normalize.d.ts.map +1 -0
- package/dist/works/text/normalize.js +188 -0
- package/dist/works/text/normalize.js.map +1 -0
- package/dist/works/types.d.ts +215 -0
- package/dist/works/types.d.ts.map +1 -0
- package/dist/works/types.js +6 -0
- package/dist/works/types.js.map +1 -0
- package/dist/works/util/debug.d.ts +7 -0
- package/dist/works/util/debug.d.ts.map +1 -0
- package/dist/works/util/debug.js +9 -0
- package/dist/works/util/debug.js.map +1 -0
- package/dist/works/util/license.d.ts +9 -0
- package/dist/works/util/license.d.ts.map +1 -0
- package/dist/works/util/license.js +39 -0
- package/dist/works/util/license.js.map +1 -0
- package/dist/works/util/normalize.d.ts +2 -0
- package/dist/works/util/normalize.d.ts.map +1 -0
- package/dist/works/util/normalize.js +76 -0
- package/dist/works/util/normalize.js.map +1 -0
- package/dist/works/util/parse.d.ts +8 -0
- package/dist/works/util/parse.d.ts.map +1 -0
- package/dist/works/util/parse.js +32 -0
- package/dist/works/util/parse.js.map +1 -0
- package/dist/works/util/source.d.ts +10 -0
- package/dist/works/util/source.d.ts.map +1 -0
- package/dist/works/util/source.js +48 -0
- package/dist/works/util/source.js.map +1 -0
- package/dist/works/util/version-label.d.ts +2 -0
- package/dist/works/util/version-label.d.ts.map +1 -0
- package/dist/works/util/version-label.js +8 -0
- package/dist/works/util/version-label.js.map +1 -0
- package/dist/works/util/work-id.d.ts +2 -0
- package/dist/works/util/work-id.d.ts.map +1 -0
- package/dist/works/util/work-id.js +27 -0
- package/dist/works/util/work-id.js.map +1 -0
- package/package.json +208 -0
|
@@ -0,0 +1,321 @@
|
|
|
1
|
+
import { createHash } from "node:crypto";
|
|
2
|
+
import { generateContentWithPdf } from "../../ai/google/client";
|
|
3
|
+
import { NormalizedDocSchema, } from "../normalized-doc-schema";
|
|
4
|
+
import { normalizeDOI, normalizePMCID } from "../../works/id/normalize-ids";
|
|
5
|
+
import { applyNormalizedCompleteIds } from "../../works/id/resolveIds/assign";
|
|
6
|
+
import { PipelineError, PIPELINE_ERROR_CODES, } from "../../works/errors";
|
|
7
|
+
import { PDF_EXTRACTION_PROMPT, PDF_EXTRACTION_SYSTEM } from "./prompt";
|
|
8
|
+
/**
 * Locate the bracket that closes the JSON value opening at `text[0]`.
 *
 * Braces and brackets that appear inside string literals are ignored, and
 * backslash escapes inside strings are skipped so an escaped quote does not
 * end the string early.
 *
 * @param text - Text whose first character is the opening `{`.
 * @returns Index of the matching closer, or -1 when the value never closes
 *   (for example because the text was cut off).
 */
function findBalancedObjectEnd(text) {
    let depth = 0;
    let inString = false;
    for (let i = 0; i < text.length; i++) {
        const ch = text[i];
        if (inString) {
            if (ch === "\\")
                i++; // skip the escaped character
            else if (ch === '"')
                inString = false;
            continue;
        }
        if (ch === '"') {
            inString = true;
        }
        else if (ch === "{" || ch === "[") {
            depth++;
        }
        else if (ch === "}" || ch === "]") {
            depth--;
            if (depth === 0)
                return i;
        }
    }
    return -1;
}
/**
 * Pull the first JSON value out of a model response.
 *
 * Attempts, in order:
 *   1. parse the trimmed text as-is;
 *   2. strip a leading/trailing markdown code fence and parse;
 *   3. parse everything from the first `{` onward;
 *   4. parse only the balanced `{ ... }` span starting at the first `{`,
 *      which tolerates trailing prose or a closing fence after the object;
 *   5. when `truncated` is set, hand the remainder to `repairTruncatedJson`.
 *
 * @param text - Raw response text.
 * @param truncated - Whether the response is known to have been cut off;
 *   enables the repair step.
 * @returns The parsed value, or `null` when nothing could be parsed.
 */
function extractFirstJson(text, truncated = false) {
    const trimmed = text.trim();
    try {
        return JSON.parse(trimmed);
    }
    catch { }
    // Strip markdown fencing if present
    const unfenced = trimmed
        .replace(/^```(?:json)?\s*/i, "")
        .replace(/\s*```$/i, "")
        .trim();
    try {
        return JSON.parse(unfenced);
    }
    catch { }
    // Find the first { and try to parse from there
    const objStart = unfenced.indexOf("{");
    if (objStart === -1)
        return null;
    const candidate = unfenced.slice(objStart);
    try {
        return JSON.parse(candidate);
    }
    catch { }
    // The object may be followed by extra text (closing fence, commentary).
    // Parse only the balanced span so that trailing noise is ignored.
    const objEnd = findBalancedObjectEnd(candidate);
    if (objEnd !== -1 && objEnd < candidate.length - 1) {
        try {
            return JSON.parse(candidate.slice(0, objEnd + 1));
        }
        catch { }
    }
    // If truncated, attempt to repair by closing open structures
    if (truncated) {
        const repaired = repairTruncatedJson(candidate);
        if (repaired)
            return repaired;
    }
    return null;
}
|
|
40
|
+
/**
 * Return `text` without trailing JSON whitespace (space, tab, LF, CR).
 * Walks backwards by index instead of using a regex so that very large
 * inputs (200KB+) stay cheap.
 */
function trimTrailingJsonWhitespace(text) {
    let end = text.length;
    while (end > 0 && " \t\n\r".includes(text[end - 1]))
        end--;
    return end === text.length ? text : text.slice(0, end);
}
/**
 * Turn a JSON prefix into a syntactically complete candidate.
 *
 * A single pass tracks open objects/arrays on a stack and records whether
 * the text ends inside a string. The tail is then patched:
 *   - cut inside an object key        -> the partial key is dropped;
 *   - cut inside a string value       -> an incomplete escape (`\` or a short
 *                                        `\uXXXX`) is removed, then the
 *                                        string is closed;
 *   - complete key with no colon      -> `: null` is appended;
 *   - key followed by a bare colon    -> `null` is appended;
 *   - trailing comma outside a string -> removed.
 * Finally the open structures are closed in reverse nesting order.
 *
 * @param fragment - JSON text that may stop mid-value.
 * @returns The patched text; it is not guaranteed to parse.
 */
function closeOpenJson(fragment) {
    const closers = [];
    let inString = false;
    let stringStart = -1; // index of the opening quote of the current string
    let stringIsKey = false; // whether the most recent string sits in key position
    let escapeStart = -1; // index of the last backslash seen in the current string
    let lastToken = ""; // last significant character outside strings
    for (let i = 0; i < fragment.length; i++) {
        const ch = fragment[i];
        if (inString) {
            if (ch === "\\") {
                escapeStart = i;
                i++; // skip the escaped character
            }
            else if (ch === '"') {
                inString = false;
                lastToken = '"';
            }
            continue;
        }
        if (ch === '"') {
            inString = true;
            stringStart = i;
            escapeStart = -1;
            // Inside an object, a string right after `{` or `,` is a key.
            stringIsKey =
                closers[closers.length - 1] === "}" &&
                    (lastToken === "{" || lastToken === ",");
            continue;
        }
        if (" \t\n\r".includes(ch))
            continue;
        lastToken = ch;
        if (ch === "{")
            closers.push("}");
        else if (ch === "[")
            closers.push("]");
        else if (ch === "}" || ch === "]")
            closers.pop();
    }
    let out = fragment;
    if (inString && stringIsKey) {
        // A half-written key carries no usable data: drop it entirely.
        out = out.slice(0, stringStart);
    }
    else if (inString) {
        if (escapeStart !== -1) {
            // `\uXXXX` needs six characters, every other escape needs two.
            const escapeLength = fragment[escapeStart + 1] === "u" ? 6 : 2;
            if (escapeStart + escapeLength > fragment.length)
                out = out.slice(0, escapeStart);
        }
        out += '"';
    }
    else if (lastToken === '"' && stringIsKey) {
        // Complete key but the colon and value never arrived.
        out += ": null";
    }
    // Only reached with a comma outside of any string; a comma that is part
    // of a string value is now followed by the closing quote.
    out = out.replace(/,\s*$/, "");
    // If the last complete token was a key with no value (e.g. `"surname":`),
    // add a null so the object entry is valid
    if (/:\s*$/.test(out))
        out += "null";
    // Close open structures in reverse nesting order
    for (let i = closers.length - 1; i >= 0; i--)
        out += closers[i];
    return out;
}
/**
 * Attempt to repair JSON truncated mid-stream by closing open structures
 * in the correct nesting order using a stack.
 *
 * First the whole text is patched and parsed. If that fails (for example the
 * text stops inside a bare literal such as `tru`), trailing lines are removed
 * one at a time, up to 20 times, and each shorter prefix is patched and
 * parsed again.
 *
 * @param text - JSON text that was cut off before completion.
 * @returns The parsed value, or `null` when no attempt produced valid JSON.
 */
function repairTruncatedJson(text) {
    const maxLineTrims = 20;
    let remainder = trimTrailingJsonWhitespace(text);
    try {
        return JSON.parse(closeOpenJson(remainder));
    }
    catch { }
    // If that didn't work, try progressively trimming trailing partial entries
    // by removing the last line repeatedly
    for (let i = 0; i < maxLineTrims; i++) {
        const lastNewline = remainder.lastIndexOf("\n");
        if (lastNewline === -1)
            break;
        // Trimming whitespace here keeps blank lines from using up attempts.
        remainder = trimTrailingJsonWhitespace(remainder.slice(0, lastNewline));
        try {
            return JSON.parse(closeOpenJson(remainder));
        }
        catch { }
    }
    return null;
}
|
|
143
|
+
/**
 * Recursively remove `null` from a parsed JSON value.
 *
 * Gemini often returns `null` for missing fields instead of omitting the key,
 * which causes Zod `.optional()` fields to fail validation.
 *
 * - `null` itself becomes `undefined`.
 * - Object properties whose value is `null` are omitted from the copy.
 * - Arrays keep their length: a `null` element becomes `undefined` in place.
 * - Any other value is returned unchanged.
 *
 * The input is not mutated; objects and arrays are rebuilt.
 */
function stripNulls(obj) {
    if (obj === null)
        return undefined;
    if (typeof obj !== "object")
        return obj;
    if (Array.isArray(obj))
        return obj.map((item) => stripNulls(item));
    const result = {};
    for (const key of Object.keys(obj)) {
        const value = obj[key];
        if (value === null)
            continue;
        result[key] = stripNulls(value);
    }
    return result;
}
|
|
163
|
+
/** Pause, in milliseconds, before moving on to the next fallback step. */
const RETRY_DELAY_MS = 500;
/** Model name shared by every step of the fallback chain. */
const PRIMARY_MODEL = "gemini-2.5-flash-lite";
/**
 * Ordered list of extraction attempts. Each step names the upload strategy,
 * the model, and a label used in log output. A step with `jsonMime: false`
 * is sent without the `application/json` response MIME type.
 */
const FALLBACK_CHAIN = [
    {
        strategy: "inline",
        model: PRIMARY_MODEL,
        label: "inline/2.5-flash-lite",
    },
    {
        strategy: "inline",
        model: PRIMARY_MODEL,
        label: "inline/2.5-flash-lite/no-json-mime",
        jsonMime: false,
    },
    {
        strategy: "file_upload",
        model: PRIMARY_MODEL,
        label: "file_upload/2.5-flash-lite",
    },
];
|
|
170
|
+
/**
 * Extract a normalized document from a PDF by prompting Gemini and validating
 * the response against `NormalizedDocSchema`.
 *
 * The steps in `FALLBACK_CHAIN` are tried in order. Only a `recitation`
 * error moves on to the next step (after `RETRY_DELAY_MS`); any other API
 * error, a JSON extraction failure, or a schema validation failure stops the
 * loop immediately.
 *
 * Side effect: `input.completeIds` is mutated. A DOI / PMCID found in the
 * extracted metadata is written only when that key is currently empty, and
 * `applyNormalizedCompleteIds` is then applied to the same object.
 *
 * @param input - `{ pdfBuffer, completeIds, logger, cid }`. `logger` is
 *   optional, as are its `warn` / `info` methods; `cid` is logged as
 *   `correlationId`.
 * @returns The normalized document plus convenience fields (title, abstract
 *   text, license hint, version label, SHA-256 of the PDF, blocks, sections,
 *   citations, assets).
 * @throws PipelineError with `UPSTREAM_FAILURE` when every step ended in a
 *   recitation block, otherwise with `INTERNAL_ERROR`. The error details
 *   carry `failureClass` and the number of strategies actually attempted.
 */
export async function normalizePdfToDoc(input) {
    const { pdfBuffer, completeIds, logger, cid } = input;
    const pdfBase64 = pdfBuffer.toString("base64");
    const pdfSha256 = createHash("sha256").update(pdfBuffer).digest("hex");
    let lastError;
    let lastCode;
    // Number of chain steps that were actually started; reported on failure.
    let strategiesAttempted = 0;
    for (let i = 0; i < FALLBACK_CHAIN.length; i++) {
        const step = FALLBACK_CHAIN[i];
        strategiesAttempted = i + 1;
        const res = await generateContentWithPdf({
            pdfBase64,
            pdfBuffer,
            prompt: PDF_EXTRACTION_PROMPT,
            systemInstruction: PDF_EXTRACTION_SYSTEM,
            model: step.model,
            strategy: step.strategy,
            temperature: 0.1,
            maxOutputTokens: 65536,
            // JSON response mode is on unless the step explicitly disables it.
            ...(step.jsonMime !== false && { responseMimeType: "application/json" }),
        });
        if (!res.ok) {
            lastError = res.error;
            lastCode = res.code;
            logger?.warn?.({
                correlationId: cid,
                step: step.label,
                stepIndex: i,
                code: res.code,
                finishReason: res.meta?.finishReason,
                error: res.error,
            }, "pdf.normalize.gemini_error");
            // For RECITATION, escalate to next fallback step immediately
            if (res.code === "recitation" && i < FALLBACK_CHAIN.length - 1) {
                await new Promise((r) => setTimeout(r, RETRY_DELAY_MS));
                continue;
            }
            // Last step, or any other error (API errors, safety blocks, etc.): stop.
            break;
        }
        const rawText = res.text;
        // Optional chaining mirrors the error path above; a missing `meta`
        // simply means the response is not treated as truncated.
        const isTruncated = res.meta?.finishReason === "MAX_TOKENS";
        if (isTruncated) {
            logger?.warn?.({ correlationId: cid, step: step.label, textLength: rawText.length }, "pdf.normalize.truncated_response");
        }
        const parsed = extractFirstJson(rawText, isTruncated);
        if (!parsed) {
            lastError = "Could not extract JSON from Gemini response";
            lastCode = "json_extraction_failed";
            logger?.warn?.({
                correlationId: cid,
                step: step.label,
                stepIndex: i,
                rawTextLength: rawText.length,
            }, "pdf.normalize.json_extraction_failed");
            // JSON parse failure is not a RECITATION issue — don't escalate strategies
            break;
        }
        const validated = NormalizedDocSchema.safeParse(stripNulls(parsed));
        if (!validated.success) {
            lastError = `Schema validation failed: ${JSON.stringify(validated.error.flatten())}`;
            lastCode = "schema_validation_failed";
            logger?.warn?.({
                correlationId: cid,
                step: step.label,
                stepIndex: i,
                errors: validated.error.flatten(),
            }, "pdf.normalize.schema_validation_failed");
            // Schema failure is not a RECITATION issue — don't escalate strategies
            break;
        }
        const normalizedDoc = validated.data;
        const meta = normalizedDoc.metadata;
        // Merge IDs from extracted metadata into completeIds (non-destructive)
        const metaIds = meta?.ids ?? {};
        const mergeIfEmpty = (key, value) => {
            if (!value)
                return;
            if (completeIds[key])
                return;
            completeIds[key] = value;
        };
        if (metaIds.doi)
            mergeIfEmpty("doi", normalizeDOI(metaIds.doi));
        if (metaIds.pmcid)
            mergeIfEmpty("pmcid", normalizePMCID(metaIds.pmcid));
        applyNormalizedCompleteIds(completeIds);
        // Extract convenience fields
        const title = meta?.title ?? undefined;
        const abstractText = meta?.abstract?.paragraphs?.join(" ") ?? undefined;
        const licenseFromMeta = meta?.license?.href ?? meta?.license?.type ?? null;
        const versionLabel = completeIds.version ?? undefined;
        logger?.info?.({
            correlationId: cid,
            source: "pdf-gemini",
            step: step.label,
            haveTitle: Boolean(title),
            haveAbstract: Boolean(abstractText),
            sectionCount: normalizedDoc.sections?.length ?? 0,
            blockCount: normalizedDoc.blocks?.length ?? 0,
            citationCount: normalizedDoc.citations?.length ?? 0,
            versionLabel,
        }, "pdf.normalize.success");
        return {
            isPdf: true,
            versionLabel,
            normalizedJson: normalizedDoc,
            metadata: meta || null,
            schemaVersion: normalizedDoc.schemaVersion ?? null,
            title,
            abstractText,
            licenseFromMeta,
            license: null,
            source: "pdf-gemini",
            sourceUrl: null,
            rawXml: null,
            pdfSha256,
            blocks: normalizedDoc.blocks,
            sections: normalizedDoc.sections,
            citations: normalizedDoc.citations,
            assets: normalizedDoc.assets,
        };
    }
    // `lastCode` is still "recitation" only when the final step was blocked,
    // i.e. every strategy in the chain was tried.
    const isRecitationExhausted = lastCode === "recitation";
    throw new PipelineError(isRecitationExhausted
        ? PIPELINE_ERROR_CODES.UPSTREAM_FAILURE
        : PIPELINE_ERROR_CODES.INTERNAL_ERROR, isRecitationExhausted
        ? `PDF normalization failed: all strategies exhausted due to Gemini RECITATION block. The model refused to extract content from this PDF across ${FALLBACK_CHAIN.length} fallback strategies.`
        : `PDF normalization failed: ${lastError}`, {
        failureClass: lastCode ?? "unknown",
        strategiesAttempted,
    });
}
|
|
304
|
+
/**
 * Build the raw-content envelope that describes a PDF input.
 *
 * The envelope carries no XML (the `xml` field is an empty string), records
 * the PDF's size and SHA-256, and stamps a `normalization` section whose
 * `created_at` is the current time in ISO-8601 form.
 *
 * @param {{ byteLength?: number, length?: number }} pdfBuffer - The PDF bytes.
 *   Any binary container is accepted: `byteLength` is used when present
 *   (Buffer, typed arrays, ArrayBuffer), otherwise `length`.
 * @param {string} pdfSha256 - SHA-256 digest of the PDF; stored both as
 *   `hash_sha256` and as `normalization.input_hash`.
 * @returns {object} The envelope object.
 */
export function buildPdfRawEnvelope(pdfBuffer, pdfSha256) {
    // `byteLength` is the true byte count for every binary container; `.length`
    // is undefined on ArrayBuffer and counts elements (not bytes) on wider
    // typed arrays. Fall back to `.length` for containers without byteLength.
    const sizeBytes = pdfBuffer.byteLength ?? pdfBuffer.length;
    return {
        xml: "", // no XML for PDF source
        size_bytes: sizeBytes,
        hash_sha256: pdfSha256,
        content_type: "application/pdf",
        encoding: "binary",
        source: { name: "pdf-upload" },
        normalization: {
            schema_version: "v1",
            normalizer_version: "pdf-gemini-1.0",
            input_hash: pdfSha256,
            created_at: new Date().toISOString(),
            warnings: [],
        },
    };
}
|
|
321
|
+
//# sourceMappingURL=normalize.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"normalize.js","sourceRoot":"","sources":["../../../src/normalization/pdf/normalize.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,UAAU,EAAE,MAAM,aAAa,CAAC;AACzC,OAAO,EAAE,sBAAsB,EAAE,MAAM,wBAAwB,CAAC;AAEhE,OAAO,EACL,mBAAmB,GAEpB,MAAM,0BAA0B,CAAC;AAGlC,OAAO,EAAE,YAAY,EAAE,cAAc,EAAE,MAAM,8BAA8B,CAAC;AAC5E,OAAO,EAAE,0BAA0B,EAAE,MAAM,kCAAkC,CAAC;AAC9E,OAAO,EACL,aAAa,EACb,oBAAoB,GACrB,MAAM,oBAAoB,CAAC;AAC5B,OAAO,EAAE,qBAAqB,EAAE,qBAAqB,EAAE,MAAM,UAAU,CAAC;AA6BxE,SAAS,gBAAgB,CAAC,IAAY,EAAE,SAAS,GAAG,KAAK;IACvD,MAAM,OAAO,GAAG,IAAI,CAAC,IAAI,EAAE,CAAC;IAC5B,IAAI,CAAC;QACH,OAAO,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC;IAC7B,CAAC;IAAC,MAAM,CAAC,CAAA,CAAC;IAEV,oCAAoC;IACpC,MAAM,QAAQ,GAAG,OAAO;SACrB,OAAO,CAAC,mBAAmB,EAAE,EAAE,CAAC;SAChC,OAAO,CAAC,UAAU,EAAE,EAAE,CAAC;SACvB,IAAI,EAAE,CAAC;IACV,IAAI,CAAC;QACH,OAAO,IAAI,CAAC,KAAK,CAAC,QAAQ,CAAC,CAAC;IAC9B,CAAC;IAAC,MAAM,CAAC,CAAA,CAAC;IAEV,+CAA+C;IAC/C,MAAM,QAAQ,GAAG,QAAQ,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC;IACvC,IAAI,QAAQ,KAAK,CAAC,CAAC;QAAE,OAAO,IAAI,CAAC;IACjC,MAAM,SAAS,GAAG,QAAQ,CAAC,KAAK,CAAC,QAAQ,CAAC,CAAC;IAC3C,IAAI,CAAC;QACH,OAAO,IAAI,CAAC,KAAK,CAAC,SAAS,CAAC,CAAC;IAC/B,CAAC;IAAC,MAAM,CAAC,CAAA,CAAC;IAEV,6DAA6D;IAC7D,IAAI,SAAS,EAAE,CAAC;QACd,MAAM,QAAQ,GAAG,mBAAmB,CAAC,SAAS,CAAC,CAAC;QAChD,IAAI,QAAQ;YAAE,OAAO,QAAQ,CAAC;IAChC,CAAC;IAED,OAAO,IAAI,CAAC;AACd,CAAC;AAED;;;GAGG;AACH,SAAS,mBAAmB,CAAC,IAAY;IACvC,qEAAqE;IACrE,qCAAqC;IACrC,IAAI,GAAG,GAAG,IAAI,CAAC,MAAM,CAAC;IAEtB,4BAA4B;IAC5B,OAAO,GAAG,GAAG,CAAC,IAAI,SAAS,CAAC,QAAQ,CAAC,IAAI,CAAC,GAAG,GAAG,CAAC,CAAE,CAAC;QAAE,GAAG,EAAE,CAAC;IAE5D,yFAAyF;IACzF,8BAA8B;IAC9B,IAAI,OAAO,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,GAAG,CAAC,CAAC;IACjC,OAAO,GAAG,OAAO,CAAC,OAAO,CAAC,OAAO,EAAE,EAAE,CAAC,CAAC;IAEvC,uDAAuD;IACvD,MAAM,KAAK,GAAa,EAAE,CAAC;IAC3B,IAAI,QAAQ,GAAG,KAAK,CAAC;IACrB,IAAI,OAAO,GAAG,KAAK,CAAC;IAEpB,KAAK,MAAM,EAAE,IAAI,OAAO,EAAE,CAAC;QACzB,IAAI,OAAO,EAAE,CAAC;YAAC,OAAO,GAAG,KAAK,CAAC;YAAC,SAAS;QAAC,CAAC;QAC3C,IAAI,EAAE,KAAK,IAAI,EAAE,CAAC;YAAC,OAAO,GAAG,IAAI,CAAC;YAAC,SAAS;QAAC,C
AAC;QAC9C,IAAI,EAAE,KAAK,GAAG,EAAE,CAAC;YAAC,QAAQ,GAAG,CAAC,QAAQ,CAAC;YAAC,SAAS;QAAC,CAAC;QACnD,IAAI,QAAQ;YAAE,SAAS;QACvB,IAAI,EAAE,KAAK,GAAG;YAAE,KAAK,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;aAC3B,IAAI,EAAE,KAAK,GAAG;YAAE,KAAK,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;aAChC,IAAI,EAAE,KAAK,GAAG,IAAI,EAAE,KAAK,GAAG;YAAE,KAAK,CAAC,GAAG,EAAE,CAAC;IACjD,CAAC;IAED,wCAAwC;IACxC,IAAI,QAAQ;QAAE,OAAO,IAAI,GAAG,CAAC;IAE7B,0EAA0E;IAC1E,0CAA0C;IAC1C,IAAI,OAAO,CAAC,IAAI,CAAC,OAAO,CAAC;QAAE,OAAO,IAAI,MAAM,CAAC;IAE7C,iDAAiD;IACjD,OAAO,KAAK,CAAC,MAAM,GAAG,CAAC;QAAE,OAAO,IAAI,KAAK,CAAC,GAAG,EAAE,CAAC;IAEhD,IAAI,CAAC;QACH,OAAO,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC;IAC7B,CAAC;IAAC,MAAM,CAAC,CAAA,CAAC;IAEV,2EAA2E;IAC3E,wDAAwD;IACxD,IAAI,QAAQ,GAAG,OAAO,CAAC;IACvB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,EAAE,EAAE,CAAC,EAAE,EAAE,CAAC;QAC5B,MAAM,WAAW,GAAG,QAAQ,CAAC,WAAW,CAAC,IAAI,CAAC,CAAC;QAC/C,IAAI,WAAW,KAAK,CAAC,CAAC;YAAE,MAAM;QAC9B,QAAQ,GAAG,QAAQ,CAAC,KAAK,CAAC,CAAC,EAAE,WAAW,CAAC,CAAC,OAAO,CAAC,OAAO,EAAE,EAAE,CAAC,CAAC;QAE/D,gBAAgB;QAChB,MAAM,EAAE,GAAa,EAAE,CAAC;QACxB,IAAI,KAAK,GAAG,KAAK,CAAC;QAClB,IAAI,GAAG,GAAG,KAAK,CAAC;QAChB,KAAK,MAAM,EAAE,IAAI,QAAQ,EAAE,CAAC;YAC1B,IAAI,GAAG,EAAE,CAAC;gBAAC,GAAG,GAAG,KAAK,CAAC;gBAAC,SAAS;YAAC,CAAC;YACnC,IAAI,EAAE,KAAK,IAAI,EAAE,CAAC;gBAAC,GAAG,GAAG,IAAI,CAAC;gBAAC,SAAS;YAAC,CAAC;YAC1C,IAAI,EAAE,KAAK,GAAG,EAAE,CAAC;gBAAC,KAAK,GAAG,CAAC,KAAK,CAAC;gBAAC,SAAS;YAAC,CAAC;YAC7C,IAAI,KAAK;gBAAE,SAAS;YACpB,IAAI,EAAE,KAAK,GAAG;gBAAE,EAAE,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;iBACxB,IAAI,EAAE,KAAK,GAAG;gBAAE,EAAE,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;iBAC7B,IAAI,EAAE,KAAK,GAAG,IAAI,EAAE,KAAK,GAAG;gBAAE,EAAE,CAAC,GAAG,EAAE,CAAC;QAC9C,CAAC;QACD,IAAI,KAAK;YAAE,QAAQ,IAAI,GAAG,CAAC;QAC3B,IAAI,OAAO,CAAC,IAAI,CAAC,QAAQ,CAAC;YAAE,QAAQ,IAAI,MAAM,CAAC;QAC/C,IAAI,OAAO,GAAG,QAAQ,CAAC;QACvB,OAAO,EAAE,CAAC,MAAM,GAAG,CAAC;YAAE,OAAO,IAAI,EAAE,CAAC,GAAG,EAAE,CAAC;QAC1C,IAAI,CAAC;YACH,OAAO,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC;QAC7B,CAAC;QAAC,MAAM,CAAC,CAAA,CAAC;IACZ,CAAC;IAED,OAAO,IAAI,CAAC;AACd,CAAC;AAED;;;;GAIG;AACH
,SAAS,UAAU,CAAC,GAAY;IAC9B,IAAI,GAAG,KAAK,IAAI;QAAE,OAAO,SAAS,CAAC;IACnC,IAAI,KAAK,CAAC,OAAO,CAAC,GAAG,CAAC;QAAE,OAAO,GAAG,CAAC,GAAG,CAAC,UAAU,CAAC,CAAC;IACnD,IAAI,OAAO,GAAG,KAAK,QAAQ,EAAE,CAAC;QAC5B,MAAM,GAAG,GAA4B,EAAE,CAAC;QACxC,KAAK,MAAM,CAAC,CAAC,EAAE,CAAC,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,GAA8B,CAAC,EAAE,CAAC;YACpE,IAAI,CAAC,KAAK,IAAI;gBAAE,GAAG,CAAC,CAAC,CAAC,GAAG,UAAU,CAAC,CAAC,CAAC,CAAC;QACzC,CAAC;QACD,OAAO,GAAG,CAAC;IACb,CAAC;IACD,OAAO,GAAG,CAAC;AACb,CAAC;AAED,MAAM,cAAc,GAAG,GAAG,CAAC;AAa3B,MAAM,aAAa,GAAG,uBAAuB,CAAC;AAE9C,MAAM,cAAc,GAAmB;IACrC,EAAE,QAAQ,EAAE,QAAQ,EAAE,KAAK,EAAE,aAAa,EAAE,KAAK,EAAE,uBAAuB,EAAE;IAC5E,EAAE,QAAQ,EAAE,QAAQ,EAAE,KAAK,EAAE,aAAa,EAAE,KAAK,EAAE,oCAAoC,EAAE,QAAQ,EAAE,KAAK,EAAE;IAC1G,EAAE,QAAQ,EAAE,aAAa,EAAE,KAAK,EAAE,aAAa,EAAE,KAAK,EAAE,4BAA4B,EAAE;CACvF,CAAC;AAEF,MAAM,CAAC,KAAK,UAAU,iBAAiB,CACrC,KAAwB;IAExB,MAAM,EAAE,SAAS,EAAE,WAAW,EAAE,MAAM,EAAE,GAAG,EAAE,GAAG,KAAK,CAAC;IAEtD,MAAM,SAAS,GAAG,SAAS,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAAC;IAC/C,MAAM,SAAS,GAAG,UAAU,CAAC,QAAQ,CAAC,CAAC,MAAM,CAAC,SAAS,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;IAEvE,IAAI,SAA6B,CAAC;IAClC,IAAI,QAA4B,CAAC;IAEjC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,cAAc,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QAC/C,MAAM,IAAI,GAAG,cAAc,CAAC,CAAC,CAAE,CAAC;QAEhC,MAAM,GAAG,GAAG,MAAM,sBAAsB,CAAC;YACvC,SAAS;YACT,SAAS;YACT,MAAM,EAAE,qBAAqB;YAC7B,iBAAiB,EAAE,qBAAqB;YACxC,KAAK,EAAE,IAAI,CAAC,KAAK;YACjB,QAAQ,EAAE,IAAI,CAAC,QAAQ;YACvB,WAAW,EAAE,GAAG;YAChB,eAAe,EAAE,KAAK;YACtB,GAAG,CAAC,IAAI,CAAC,QAAQ,KAAK,KAAK,IAAI,EAAE,gBAAgB,EAAE,kBAAkB,EAAE,CAAC;SACzE,CAAC,CAAC;QAEH,IAAI,CAAC,GAAG,CAAC,EAAE,EAAE,CAAC;YACZ,SAAS,GAAG,GAAG,CAAC,KAAK,CAAC;YACtB,QAAQ,GAAG,GAAG,CAAC,IAAI,CAAC;YACpB,MAAM,EAAE,IAAI,EAAE,CACZ;gBACE,aAAa,EAAE,GAAG;gBAClB,IAAI,EAAE,IAAI,CAAC,KAAK;gBAChB,SAAS,EAAE,CAAC;gBACZ,IAAI,EAAE,GAAG,CAAC,IAAI;gBACd,YAAY,EAAE,GAAG,CAAC,IAAI,EAAE,YAAY;gBACpC,KAAK,EAAE,GAAG,CAAC,KAAK;aACjB,EACD,4BAA4B,CAC7B,CAAC;YAEF,6DAA6D;YAC7D,IAAI,GAAG,CAAC,IAAI,KAAK,YAAY,EAAE,CAAC;gBAC9B,IAAI,CAAC,GAAG,cAAc,CAAC,MAAM,GAAG,
CAAC,EAAE,CAAC;oBAClC,MAAM,IAAI,OAAO,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,UAAU,CAAC,CAAC,EAAE,cAAc,CAAC,CAAC,CAAC;oBACxD,SAAS;gBACX,CAAC;gBACD,MAAM;YACR,CAAC;YAED,kEAAkE;YAClE,MAAM;QACR,CAAC;QAED,MAAM,OAAO,GAAG,GAAG,CAAC,IAAI,CAAC;QACzB,MAAM,WAAW,GAAG,GAAG,CAAC,IAAI,CAAC,YAAY,KAAK,YAAY,CAAC;QAC3D,IAAI,WAAW,EAAE,CAAC;YAChB,MAAM,EAAE,IAAI,EAAE,CACZ,EAAE,aAAa,EAAE,GAAG,EAAE,IAAI,EAAE,IAAI,CAAC,KAAK,EAAE,UAAU,EAAE,OAAO,CAAC,MAAM,EAAE,EACpE,kCAAkC,CACnC,CAAC;QACJ,CAAC;QACD,MAAM,MAAM,GAAG,gBAAgB,CAAC,OAAO,EAAE,WAAW,CAAC,CAAC;QACtD,IAAI,CAAC,MAAM,EAAE,CAAC;YACZ,SAAS,GAAG,6CAA6C,CAAC;YAC1D,QAAQ,GAAG,wBAAwB,CAAC;YACpC,MAAM,EAAE,IAAI,EAAE,CACZ;gBACE,aAAa,EAAE,GAAG;gBAClB,IAAI,EAAE,IAAI,CAAC,KAAK;gBAChB,SAAS,EAAE,CAAC;gBACZ,aAAa,EAAE,OAAO,CAAC,MAAM;aAC9B,EACD,sCAAsC,CACvC,CAAC;YACF,2EAA2E;YAC3E,MAAM;QACR,CAAC;QAED,MAAM,SAAS,GAAG,mBAAmB,CAAC,SAAS,CAAC,UAAU,CAAC,MAAM,CAAC,CAAC,CAAC;QACpE,IAAI,CAAC,SAAS,CAAC,OAAO,EAAE,CAAC;YACvB,SAAS,GAAG,6BAA6B,IAAI,CAAC,SAAS,CAAC,SAAS,CAAC,KAAK,CAAC,OAAO,EAAE,CAAC,EAAE,CAAC;YACrF,QAAQ,GAAG,0BAA0B,CAAC;YACtC,MAAM,EAAE,IAAI,EAAE,CACZ;gBACE,aAAa,EAAE,GAAG;gBAClB,IAAI,EAAE,IAAI,CAAC,KAAK;gBAChB,SAAS,EAAE,CAAC;gBACZ,MAAM,EAAE,SAAS,CAAC,KAAK,CAAC,OAAO,EAAE;aAClC,EACD,wCAAwC,CACzC,CAAC;YACF,uEAAuE;YACvE,MAAM;QACR,CAAC;QAED,MAAM,aAAa,GAAG,SAAS,CAAC,IAAI,CAAC;QACrC,MAAM,IAAI,GAAG,aAAa,CAAC,QAAQ,CAAC;QAEpC,uEAAuE;QACvE,MAAM,OAAO,GAAG,IAAI,EAAE,GAAG,IAAI,EAAE,CAAC;QAChC,MAAM,YAAY,GAAG,CAAC,GAAsB,EAAE,KAAqB,EAAE,EAAE;YACrE,IAAI,CAAC,KAAK;gBAAE,OAAO;YACnB,IAAI,WAAW,CAAC,GAAG,CAAC;gBAAE,OAAO;YAC5B,WAAmB,CAAC,GAAG,CAAC,GAAG,KAAK,CAAC;QACpC,CAAC,CAAC;QAEF,IAAI,OAAO,CAAC,GAAG;YAAE,YAAY,CAAC,KAAK,EAAE,YAAY,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC;QAChE,IAAI,OAAO,CAAC,KAAK;YAAE,YAAY,CAAC,OAAO,EAAE,cAAc,CAAC,OAAO,CAAC,KAAK,CAAC,CAAC,CAAC;QACxE,0BAA0B,CAAC,WAAW,CAAC,CAAC;QAExC,6BAA6B;QAC7B,MAAM,KAAK,GAAG,IAAI,EAAE,KAAK,IAAI,SAAS,CAAC;QACvC,MAAM,YAAY,GAAG,IAAI,EAAE,QAAQ,EAAE,UAAU,EAAE,IAAI,CAAC,GAAG,CAAC,IAAI,SAAS,CAAC;QACxE,MAAM,eAAe,GAAG,IAAI,EAAE,OAAO,EAAE,IAAI,IAAI,IAAI,EAAE
,OAAO,EAAE,IAAI,IAAI,IAAI,CAAC;QAC3E,MAAM,YAAY,GAAG,WAAW,CAAC,OAAO,IAAI,SAAS,CAAC;QAEtD,MAAM,EAAE,IAAI,EAAE,CACZ;YACE,aAAa,EAAE,GAAG;YAClB,MAAM,EAAE,YAAY;YACpB,IAAI,EAAE,IAAI,CAAC,KAAK;YAChB,SAAS,EAAE,OAAO,CAAC,KAAK,CAAC;YACzB,YAAY,EAAE,OAAO,CAAC,YAAY,CAAC;YACnC,YAAY,EAAE,aAAa,CAAC,QAAQ,EAAE,MAAM,IAAI,CAAC;YACjD,UAAU,EAAE,aAAa,CAAC,MAAM,EAAE,MAAM,IAAI,CAAC;YAC7C,aAAa,EAAE,aAAa,CAAC,SAAS,EAAE,MAAM,IAAI,CAAC;YACnD,YAAY;SACb,EACD,uBAAuB,CACxB,CAAC;QAEF,OAAO;YACL,KAAK,EAAE,IAAa;YACpB,YAAY;YACZ,cAAc,EAAE,aAAa;YAC7B,QAAQ,EAAE,IAAI,IAAI,IAAI;YACtB,aAAa,EAAE,aAAa,CAAC,aAAa,IAAI,IAAI;YAClD,KAAK;YACL,YAAY;YACZ,eAAe;YACf,OAAO,EAAE,IAAI;YACb,MAAM,EAAE,YAAY;YACpB,SAAS,EAAE,IAAI;YACf,MAAM,EAAE,IAAI;YACZ,SAAS;YACT,MAAM,EAAE,aAAa,CAAC,MAAM;YAC5B,QAAQ,EAAE,aAAa,CAAC,QAAQ;YAChC,SAAS,EAAE,aAAa,CAAC,SAAS;YAClC,MAAM,EAAE,aAAa,CAAC,MAAM;SAC7B,CAAC;IACJ,CAAC;IAED,MAAM,qBAAqB,GAAG,QAAQ,KAAK,YAAY,CAAC;IACxD,MAAM,IAAI,aAAa,CACrB,qBAAqB;QACnB,CAAC,CAAC,oBAAoB,CAAC,gBAAgB;QACvC,CAAC,CAAC,oBAAoB,CAAC,cAAc,EACvC,qBAAqB;QACnB,CAAC,CAAC,gJAAgJ,cAAc,CAAC,MAAM,uBAAuB;QAC9L,CAAC,CAAC,6BAA6B,SAAS,EAAE,EAC5C;QACE,YAAY,EAAE,QAAQ,IAAI,SAAS;QACnC,mBAAmB,EAAE,cAAc,CAAC,MAAM;KAC3C,CACF,CAAC;AACJ,CAAC;AAED,MAAM,UAAU,mBAAmB,CACjC,SAAiB,EACjB,SAAiB;IAEjB,OAAO;QACL,GAAG,EAAE,EAAE,EAAE,wBAAwB;QACjC,UAAU,EAAE,SAAS,CAAC,MAAM;QAC5B,WAAW,EAAE,SAAS;QACtB,YAAY,EAAE,iBAAiB;QAC/B,QAAQ,EAAE,QAAQ;QAClB,MAAM,EAAE,EAAE,IAAI,EAAE,YAAY,EAAE;QAC9B,aAAa,EAAE;YACb,cAAc,EAAE,IAAI;YACpB,kBAAkB,EAAE,gBAAgB;YACpC,UAAU,EAAE,SAAS;YACrB,UAAU,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;YACpC,QAAQ,EAAE,EAAE;SACb;KACF,CAAC;AACJ,CAAC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"prompt.d.ts","sourceRoot":"","sources":["../../../src/normalization/pdf/prompt.ts"],"names":[],"mappings":"AAIA,eAAO,MAAM,qBAAqB,QAOvB,CAAC;AAEZ,eAAO,MAAM,qBAAqB,QA0G0D,CAAC"}
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
import { SectionCategoryEnum } from "../normalized-doc-schema";
|
|
2
|
+
// Comma-separated list of the SectionCategoryEnum options; interpolated into
// the "category" line of PDF_EXTRACTION_PROMPT so the prompt always lists the
// categories the schema accepts.
const SECTION_CATEGORIES = SectionCategoryEnum.options.join(", ");
|
|
3
|
+
/**
 * Short instruction preamble for the PDF extraction request, built by joining
 * the sentences below with single spaces. It tells the model to answer with
 * strictly valid JSON only (no markdown, fences, comments, or trailing commas)
 * and to omit optional keys it cannot determine.
 * NOTE(review): the name suggests this is sent as the system message; the
 * call site is not in this file — confirm.
 */
export const PDF_EXTRACTION_SYSTEM = [
|
|
4
|
+
"You are a precise scientific paper parser.",
|
|
5
|
+
"You receive a PDF of a scientific paper and extract its full content into a structured JSON object.",
|
|
6
|
+
"Respond with strictly valid JSON only.",
|
|
7
|
+
"No markdown, no code fences, no comments, no extra text.",
|
|
8
|
+
"If a field is optional and you cannot determine its value, omit the key entirely.",
|
|
9
|
+
"Do not include trailing commas.",
|
|
10
|
+
].join(" ");
|
|
11
|
+
/**
 * Prompt template describing the JSON object the model must produce from a
 * scientific-paper PDF: the top-level fields, the shape of "metadata",
 * "sections", "blocks" (paragraph / figure / table) and "citations", followed
 * by numbered extraction rules. The list of allowed section categories is
 * interpolated from SECTION_CATEGORIES. The text is sent to the model as-is,
 * so any edit to it changes runtime behavior.
 */
export const PDF_EXTRACTION_PROMPT = `Extract the full content of this scientific paper PDF into a JSON object with the following structure.
|
|
12
|
+
|
|
13
|
+
## Top-level fields
|
|
14
|
+
|
|
15
|
+
- "schemaVersion": always "v1"
|
|
16
|
+
- "metadata": front-matter (see below)
|
|
17
|
+
- "sections": array of section metadata (see below)
|
|
18
|
+
- "blocks": array of content blocks (see below)
|
|
19
|
+
- "citations": array of reference entries (see below)
|
|
20
|
+
|
|
21
|
+
## metadata
|
|
22
|
+
|
|
23
|
+
An object with these optional fields:
|
|
24
|
+
- "ids": { "doi"?: string, "pmcid"?: string }
|
|
25
|
+
- "title": string (the paper title)
|
|
26
|
+
- "journal": { "title"?: string, "publisher"?: { "name"?: string } }
|
|
27
|
+
- "pub_dates": [{ "pub_type"?: string, "year"?: string, "month"?: string, "day"?: string }]
|
|
28
|
+
- "volume"?: string
|
|
29
|
+
- "issue"?: string
|
|
30
|
+
- "license": { "type"?: string, "href"?: string }
|
|
31
|
+
- "abstract": { "paragraphs": [string, ...] } (split abstract into paragraphs)
|
|
32
|
+
- "contributors": [{ "role"?: "author", "given"?: string, "surname"?: string, "full"?: string, "email"?: string, "xref_aff"?: ["aff1"] }]
|
|
33
|
+
- "affiliations": [{ "id"?: "aff1", "label"?: "1", "text"?: string, "country"?: string }]
|
|
34
|
+
- "keywords": [string, ...]
|
|
35
|
+
|
|
36
|
+
## sections
|
|
37
|
+
|
|
38
|
+
An array of objects, one per document section. Assign each section:
|
|
39
|
+
- "id": a unique slug (e.g., "sec-introduction", "sec-methods")
|
|
40
|
+
- "title": the section heading as written
|
|
41
|
+
- "normalized_title": lowercase, trimmed heading
|
|
42
|
+
- "category": one of: ${SECTION_CATEGORIES}
|
|
43
|
+
- "level": integer depth (1 = top-level section, 2 = subsection, etc.)
|
|
44
|
+
- "order": 0-based position among all sections
|
|
45
|
+
- "anchor": same as id
|
|
46
|
+
- "path_anchor": slash-separated ancestor path (e.g., "sec-methods/sec-cell-culture")
|
|
47
|
+
|
|
48
|
+
## blocks
|
|
49
|
+
|
|
50
|
+
An array of content blocks in document order. Each block is one of:
|
|
51
|
+
|
|
52
|
+
### Paragraph block
|
|
53
|
+
{
|
|
54
|
+
"kind": "paragraph",
|
|
55
|
+
"plain_text": "The full paragraph text...",
|
|
56
|
+
"section_id": "sec-introduction",
|
|
57
|
+
"path_anchor": "sec-introduction",
|
|
58
|
+
"order": 0
|
|
59
|
+
}
|
|
60
|
+
|
|
61
|
+
### Figure block
|
|
62
|
+
{
|
|
63
|
+
"kind": "figure",
|
|
64
|
+
"id": "fig1",
|
|
65
|
+
"label": "Figure 1",
|
|
66
|
+
"title": "Figure title if any",
|
|
67
|
+
"plain_text": "Figure caption text",
|
|
68
|
+
"section_id": "sec-results",
|
|
69
|
+
"path_anchor": "sec-results",
|
|
70
|
+
"order": 5
|
|
71
|
+
}
|
|
72
|
+
|
|
73
|
+
### Table block
|
|
74
|
+
{
|
|
75
|
+
"kind": "table",
|
|
76
|
+
"id": "tbl1",
|
|
77
|
+
"label": "Table 1",
|
|
78
|
+
"title": "Table title if any",
|
|
79
|
+
"plain_text": "Table caption text",
|
|
80
|
+
"section_id": "sec-results",
|
|
81
|
+
"path_anchor": "sec-results",
|
|
82
|
+
"order": 6
|
|
83
|
+
}
|
|
84
|
+
|
|
85
|
+
IMPORTANT for blocks:
|
|
86
|
+
- Every block MUST have "section_id" and "path_anchor" fields (strings, required).
|
|
87
|
+
- "order" should be a global 0-based index across all blocks.
|
|
88
|
+
- Extract the FULL text of every paragraph. Do not summarize or truncate.
|
|
89
|
+
|
|
90
|
+
## citations
|
|
91
|
+
|
|
92
|
+
An array of reference list entries. For each reference:
|
|
93
|
+
{
|
|
94
|
+
"id": "ref1",
|
|
95
|
+
"label": "1",
|
|
96
|
+
"text": "Full reference string as written",
|
|
97
|
+
"articleTitle"?: string,
|
|
98
|
+
"sourceTitle"?: string (journal name),
|
|
99
|
+
"year"?: number (integer),
|
|
100
|
+
"volume"?: string,
|
|
101
|
+
"issue"?: string,
|
|
102
|
+
"fpage"?: string,
|
|
103
|
+
"lpage"?: string,
|
|
104
|
+
"doi"?: string,
|
|
105
|
+
"pmid"?: string,
|
|
106
|
+
"authors"?: [{ "given"?: string, "surname"?: string }]
|
|
107
|
+
}
|
|
108
|
+
|
|
109
|
+
## Rules
|
|
110
|
+
|
|
111
|
+
1. Extract ALL text content from the paper. Do not skip or summarize any section.
|
|
112
|
+
2. Preserve paragraph boundaries as they appear in the paper.
|
|
113
|
+
3. For section categories, use the most specific match from the allowed list. Use "other" only as a last resort.
|
|
114
|
+
4. Number all blocks with a global "order" starting at 0.
|
|
115
|
+
5. The "section_id" on each block must match an "id" in the sections array.
|
|
116
|
+
6. Do not fabricate content. Only include what is actually in the document.
|
|
117
|
+
7. For citations, extract as much structured metadata as you can parse from each reference.`;
|
|
118
|
+
//# sourceMappingURL=prompt.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"prompt.js","sourceRoot":"","sources":["../../../src/normalization/pdf/prompt.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,mBAAmB,EAAE,MAAM,0BAA0B,CAAC;AAE/D,MAAM,kBAAkB,GAAG,mBAAmB,CAAC,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AAElE,MAAM,CAAC,MAAM,qBAAqB,GAAG;IACnC,4CAA4C;IAC5C,qGAAqG;IACrG,wCAAwC;IACxC,0DAA0D;IAC1D,mFAAmF;IACnF,iCAAiC;CAClC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;AAEZ,MAAM,CAAC,MAAM,qBAAqB,GAAG;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;wBA+Bb,kBAAkB;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;4FA2EkD,CAAC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"client.d.ts","sourceRoot":"","sources":["../../../src/sources/arxiv/client.ts"],"names":[],"mappings":"AAAA,wBAAgB,gBAAgB,CAAC,OAAO,EAAE,MAAM,GAAG,MAAM,CAGxD;AAED,wBAAgB,gBAAgB,CAAC,OAAO,EAAE,MAAM,GAAG,MAAM,CAGxD;AAED,wBAAgB,iBAAiB,CAAC,OAAO,EAAE,MAAM,GAAG,MAAM,CAGzD"}
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
/**
 * Build the arXiv PDF URL for an identifier.
 *
 * Surrounding whitespace and an optional case-insensitive "arXiv:" prefix
 * (including any whitespace after the colon) are removed before the
 * identifier is placed in the URL, so pasted values such as
 * " arXiv: 2101.00001v2 " do not produce a URL with embedded spaces. The rest
 * of the identifier is used verbatim; old-style IDs containing a slash
 * (e.g. "hep-th/9901001") are kept as-is.
 *
 * @param {string} arxivId - arXiv identifier, with or without the "arXiv:" prefix.
 * @returns {string} URL of the form `https://arxiv.org/pdf/<id>.pdf`.
 */
export function buildArxivPdfUrl(arxivId) {
    const id = arxivId.trim().replace(/^arxiv:\s*/i, "");
    return `https://arxiv.org/pdf/${id}.pdf`;
}
|
|
5
|
+
/**
 * Build the arXiv abstract-page URL for an identifier.
 *
 * Surrounding whitespace and an optional case-insensitive "arXiv:" prefix
 * (including any whitespace after the colon) are removed before the
 * identifier is placed in the URL, so pasted values such as
 * " arXiv: 2101.00001v2 " do not produce a URL with embedded spaces. The rest
 * of the identifier is used verbatim.
 *
 * @param {string} arxivId - arXiv identifier, with or without the "arXiv:" prefix.
 * @returns {string} URL of the form `https://arxiv.org/abs/<id>`.
 */
export function buildArxivAbsUrl(arxivId) {
    const id = arxivId.trim().replace(/^arxiv:\s*/i, "");
    return `https://arxiv.org/abs/${id}`;
}
|
|
9
|
+
/**
 * Build the ar5iv HTML-rendering URL for an arXiv identifier.
 *
 * Surrounding whitespace and an optional case-insensitive "arXiv:" prefix
 * (including any whitespace after the colon) are removed before the
 * identifier is placed in the URL, so pasted values such as
 * " arXiv: 2101.00001v2 " do not produce a URL with embedded spaces. The rest
 * of the identifier is used verbatim.
 *
 * @param {string} arxivId - arXiv identifier, with or without the "arXiv:" prefix.
 * @returns {string} URL of the form `https://ar5iv.org/html/<id>`.
 */
export function buildAr5ivHtmlUrl(arxivId) {
    const id = arxivId.trim().replace(/^arxiv:\s*/i, "");
    return `https://ar5iv.org/html/${id}`;
}
|
|
13
|
+
//# sourceMappingURL=client.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"client.js","sourceRoot":"","sources":["../../../src/sources/arxiv/client.ts"],"names":[],"mappings":"AAAA,MAAM,UAAU,gBAAgB,CAAC,OAAe;IAC9C,MAAM,EAAE,GAAG,OAAO,CAAC,OAAO,CAAC,UAAU,EAAE,EAAE,CAAC,CAAC;IAC3C,OAAO,yBAAyB,EAAE,MAAM,CAAC;AAC3C,CAAC;AAED,MAAM,UAAU,gBAAgB,CAAC,OAAe;IAC9C,MAAM,EAAE,GAAG,OAAO,CAAC,OAAO,CAAC,UAAU,EAAE,EAAE,CAAC,CAAC;IAC3C,OAAO,yBAAyB,EAAE,EAAE,CAAC;AACvC,CAAC;AAED,MAAM,UAAU,iBAAiB,CAAC,OAAe;IAC/C,MAAM,EAAE,GAAG,OAAO,CAAC,OAAO,CAAC,UAAU,EAAE,EAAE,CAAC,CAAC;IAC3C,OAAO,0BAA0B,EAAE,EAAE,CAAC;AACxC,CAAC"}
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
import { type PreprintRecord } from "../preprint-discovery";
|
|
2
|
+
/** Optional settings accepted as `opts` by the bioRxiv source-XML helpers declared below. */
export type SourceFetchOptions = {
|
|
3
|
+
/** NOTE(review): presumably the User-Agent string for outgoing requests — confirm in the implementation. */
UA?: string;
|
|
4
|
+
/** NOTE(review): presumably the preprint version to prefer when several exist; null or omitted likely means no preference — confirm. */
preferVersion?: number | null;
|
|
5
|
+
};
|
|
6
|
+
/**
 * Return the versioned .source.xml record for the best-matching version (or null).
 *
 * @param idOrDoi - Identifier or DOI of the preprint (accepted formats are defined by the implementation, not shown here).
 * @param opts - Optional fetch settings; see SourceFetchOptions.
 * @returns A promise resolving to a PreprintRecord, or null.
 */
|
|
7
|
+
export declare function buildBiorxivLatestSourceXmlUrlViaApi(idOrDoi: string, opts?: SourceFetchOptions): Promise<PreprintRecord | null>;
|
|
8
|
+
/**
 * Fetch the JATS XML content via the API-discovered URL.
 *
 * @param idOrDoi - Identifier or DOI of the preprint (accepted formats are defined by the implementation, not shown here).
 * @param opts - Optional fetch settings; see SourceFetchOptions.
 * @returns A promise resolving to `{ url, xml, version? }`, or null.
 */
|
|
9
|
+
export declare function fetchBiorxivSourceXmlViaApi(idOrDoi: string, opts?: SourceFetchOptions): Promise<{
|
|
10
|
+
url: string;
|
|
11
|
+
xml: string;
|
|
12
|
+
version?: number | null;
|
|
13
|
+
} | null>;
|
|
14
|
+
/**
 * Build the canonical full HTML URL (versioned) from a biorxiv id/doi.
 *
 * @param idOrDoi - bioRxiv id or DOI.
 * @returns The URL string, or null (NOTE(review): presumably when the input cannot be turned into a versioned URL — confirm in the implementation).
 */
|
|
15
|
+
export declare function buildBiorxivFullHtmlUrlFromId(idOrDoi: string): string | null;
|
|
16
|
+
/**
 * Fetch the full HTML via r.jina.ai proxy to bypass origin 403s.
 *
 * @param idOrDoiWithVersion - bioRxiv id or DOI; the parameter name indicates it should include the version.
 * @param UA - Optional string (NOTE(review): presumably the User-Agent header value — confirm in the implementation).
 * @returns A promise resolving to `{ url, html }`, or null.
 */
|
|
17
|
+
export declare function fetchBiorxivFullHtmlViaProxy(idOrDoiWithVersion: string, UA?: string): Promise<{
|
|
18
|
+
url: string;
|
|
19
|
+
html: string;
|
|
20
|
+
} | null>;
|
|
21
|
+
//# sourceMappingURL=client.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"client.d.ts","sourceRoot":"","sources":["../../../src/sources/biorxiv/client.ts"],"names":[],"mappings":"AACA,OAAO,EAGL,KAAK,cAAc,EACpB,MAAM,uBAAuB,CAAC;AAI/B,MAAM,MAAM,kBAAkB,GAAG;IAC/B,EAAE,CAAC,EAAE,MAAM,CAAC;IACZ,aAAa,CAAC,EAAE,MAAM,GAAG,IAAI,CAAC;CAC/B,CAAC;AAWF,sFAAsF;AACtF,wBAAsB,oCAAoC,CACxD,OAAO,EAAE,MAAM,EACf,IAAI,CAAC,EAAE,kBAAkB,GACxB,OAAO,CAAC,cAAc,GAAG,IAAI,CAAC,CAMhC;AAED,4DAA4D;AAC5D,wBAAsB,2BAA2B,CAC/C,OAAO,EAAE,MAAM,EACf,IAAI,CAAC,EAAE,kBAAkB,GACxB,OAAO,CAAC;IAAE,GAAG,EAAE,MAAM,CAAC;IAAC,GAAG,EAAE,MAAM,CAAC;IAAC,OAAO,CAAC,EAAE,MAAM,GAAG,IAAI,CAAA;CAAE,GAAG,IAAI,CAAC,CA6IvE;AAED,0EAA0E;AAC1E,wBAAgB,6BAA6B,CAAC,OAAO,EAAE,MAAM,GAAG,MAAM,GAAG,IAAI,CAQ5E;AAED,oEAAoE;AACpE,wBAAsB,4BAA4B,CAChD,kBAAkB,EAAE,MAAM,EAC1B,EAAE,SAA0B,GAC3B,OAAO,CAAC;IAAE,GAAG,EAAE,MAAM,CAAC;IAAC,IAAI,EAAE,MAAM,CAAA;CAAE,GAAG,IAAI,CAAC,CAkB/C"}
|