@ncukondo/search-hub 0.12.0 → 0.12.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/_virtual/_commonjsHelpers.js +30 -0
- package/dist/_virtual/_commonjsHelpers.js.map +1 -0
- package/dist/_virtual/aliases.js +5 -0
- package/dist/_virtual/aliases.js.map +1 -0
- package/dist/_virtual/attributes.js +5 -0
- package/dist/_virtual/attributes.js.map +1 -0
- package/dist/_virtual/back.js +5 -0
- package/dist/_virtual/back.js.map +1 -0
- package/dist/_virtual/comment.js +5 -0
- package/dist/_virtual/comment.js.map +1 -0
- package/dist/_virtual/compile.js +5 -0
- package/dist/_virtual/compile.js.map +1 -0
- package/dist/_virtual/compile2.js +5 -0
- package/dist/_virtual/compile2.js.map +1 -0
- package/dist/_virtual/decode-data-html.js +5 -0
- package/dist/_virtual/decode-data-html.js.map +1 -0
- package/dist/_virtual/decode-data-xml.js +5 -0
- package/dist/_virtual/decode-data-xml.js.map +1 -0
- package/dist/_virtual/decode.js +5 -0
- package/dist/_virtual/decode.js.map +1 -0
- package/dist/_virtual/decode_codepoint.js +5 -0
- package/dist/_virtual/decode_codepoint.js.map +1 -0
- package/dist/_virtual/encode-html.js +5 -0
- package/dist/_virtual/encode-html.js.map +1 -0
- package/dist/_virtual/encode.js +5 -0
- package/dist/_virtual/encode.js.map +1 -0
- package/dist/_virtual/escape.js +5 -0
- package/dist/_virtual/escape.js.map +1 -0
- package/dist/_virtual/feeds.js +5 -0
- package/dist/_virtual/feeds.js.map +1 -0
- package/dist/_virtual/filters.js +5 -0
- package/dist/_virtual/filters.js.map +1 -0
- package/dist/_virtual/foreignNames.js +5 -0
- package/dist/_virtual/foreignNames.js.map +1 -0
- package/dist/_virtual/general.js +5 -0
- package/dist/_virtual/general.js.map +1 -0
- package/dist/_virtual/he.js +5 -0
- package/dist/_virtual/he.js.map +1 -0
- package/dist/_virtual/helpers.js +5 -0
- package/dist/_virtual/helpers.js.map +1 -0
- package/dist/_virtual/html.js +5 -0
- package/dist/_virtual/html.js.map +1 -0
- package/dist/_virtual/index.js +6 -0
- package/dist/_virtual/index.js.map +1 -0
- package/dist/_virtual/index10.js +5 -0
- package/dist/_virtual/index10.js.map +1 -0
- package/dist/_virtual/index11.js +5 -0
- package/dist/_virtual/index11.js.map +1 -0
- package/dist/_virtual/index2.js +5 -0
- package/dist/_virtual/index2.js.map +1 -0
- package/dist/_virtual/index3.js +5 -0
- package/dist/_virtual/index3.js.map +1 -0
- package/dist/_virtual/index4.js +5 -0
- package/dist/_virtual/index4.js.map +1 -0
- package/dist/_virtual/index5.js +7 -0
- package/dist/_virtual/index5.js.map +1 -0
- package/dist/_virtual/index6.js +5 -0
- package/dist/_virtual/index6.js.map +1 -0
- package/dist/_virtual/index7.js +5 -0
- package/dist/_virtual/index7.js.map +1 -0
- package/dist/_virtual/index8.js +5 -0
- package/dist/_virtual/index8.js.map +1 -0
- package/dist/_virtual/index9.js +5 -0
- package/dist/_virtual/index9.js.map +1 -0
- package/dist/_virtual/legacy.js +5 -0
- package/dist/_virtual/legacy.js.map +1 -0
- package/dist/_virtual/manipulation.js +5 -0
- package/dist/_virtual/manipulation.js.map +1 -0
- package/dist/_virtual/matcher.js +5 -0
- package/dist/_virtual/matcher.js.map +1 -0
- package/dist/_virtual/node.js +5 -0
- package/dist/_virtual/node.js.map +1 -0
- package/dist/_virtual/node2.js +5 -0
- package/dist/_virtual/node2.js.map +1 -0
- package/dist/_virtual/parse.js +5 -0
- package/dist/_virtual/parse.js.map +1 -0
- package/dist/_virtual/parse2.js +5 -0
- package/dist/_virtual/parse2.js.map +1 -0
- package/dist/_virtual/pseudos.js +5 -0
- package/dist/_virtual/pseudos.js.map +1 -0
- package/dist/_virtual/querying.js +5 -0
- package/dist/_virtual/querying.js.map +1 -0
- package/dist/_virtual/sort.js +5 -0
- package/dist/_virtual/sort.js.map +1 -0
- package/dist/_virtual/stringify.js +5 -0
- package/dist/_virtual/stringify.js.map +1 -0
- package/dist/_virtual/subselects.js +5 -0
- package/dist/_virtual/subselects.js.map +1 -0
- package/dist/_virtual/text.js +5 -0
- package/dist/_virtual/text.js.map +1 -0
- package/dist/_virtual/traversal.js +5 -0
- package/dist/_virtual/traversal.js.map +1 -0
- package/dist/_virtual/type.js +5 -0
- package/dist/_virtual/type.js.map +1 -0
- package/dist/_virtual/valid.js +5 -0
- package/dist/_virtual/valid.js.map +1 -0
- package/dist/_virtual/void-tag.js +5 -0
- package/dist/_virtual/void-tag.js.map +1 -0
- package/dist/cli/commands/fulltext/attach.js +1 -1
- package/dist/cli/commands/fulltext/attach.js.map +1 -1
- package/dist/cli/commands/fulltext/check.d.ts +1 -2
- package/dist/cli/commands/fulltext/check.d.ts.map +1 -1
- package/dist/cli/commands/fulltext/check.js +4 -2
- package/dist/cli/commands/fulltext/check.js.map +1 -1
- package/dist/cli/commands/fulltext/convert.d.ts.map +1 -1
- package/dist/cli/commands/fulltext/convert.js +8 -8
- package/dist/cli/commands/fulltext/convert.js.map +1 -1
- package/dist/cli/commands/fulltext/fetch.d.ts.map +1 -1
- package/dist/cli/commands/fulltext/fetch.js +10 -6
- package/dist/cli/commands/fulltext/fetch.js.map +1 -1
- package/dist/cli/commands/fulltext/index.d.ts.map +1 -1
- package/dist/cli/commands/fulltext/index.js +2 -0
- package/dist/cli/commands/fulltext/index.js.map +1 -1
- package/dist/cli/commands/fulltext/init.d.ts.map +1 -1
- package/dist/cli/commands/fulltext/init.js +6 -5
- package/dist/cli/commands/fulltext/init.js.map +1 -1
- package/dist/cli/commands/fulltext/pending.d.ts +1 -1
- package/dist/cli/commands/fulltext/pending.d.ts.map +1 -1
- package/dist/cli/commands/fulltext/pending.js +4 -2
- package/dist/cli/commands/fulltext/pending.js.map +1 -1
- package/dist/cli/commands/fulltext/status.d.ts.map +1 -1
- package/dist/cli/commands/fulltext/status.js +4 -2
- package/dist/cli/commands/fulltext/status.js.map +1 -1
- package/dist/cli/commands/fulltext/sync.d.ts.map +1 -1
- package/dist/cli/commands/fulltext/sync.js +6 -2
- package/dist/cli/commands/fulltext/sync.js.map +1 -1
- package/dist/cli/commands/review/types.d.ts +1 -1
- package/dist/cli/commands/review/types.d.ts.map +1 -1
- package/dist/cli/commands/review/types.js.map +1 -1
- package/dist/cli/index.d.ts.map +1 -1
- package/dist/cli/index.js +7 -0
- package/dist/cli/index.js.map +1 -1
- package/dist/config/schema.d.ts +2 -0
- package/dist/config/schema.d.ts.map +1 -1
- package/dist/config/schema.js +6 -0
- package/dist/config/schema.js.map +1 -1
- package/dist/{fulltext → integration}/attach-shared.d.ts +2 -2
- package/dist/integration/attach-shared.d.ts.map +1 -0
- package/dist/integration/attach-shared.js.map +1 -0
- package/dist/integration/fulltext-attach.js +1 -1
- package/dist/integration/fulltext-attach.js.map +1 -1
- package/dist/{fulltext → node_modules/@ncukondo/academic-fulltext/dist}/citation-key.js +1 -1
- package/dist/node_modules/@ncukondo/academic-fulltext/dist/citation-key.js.map +1 -0
- package/dist/node_modules/@ncukondo/academic-fulltext/dist/convert/arxiv-html-parser.js +434 -0
- package/dist/node_modules/@ncukondo/academic-fulltext/dist/convert/arxiv-html-parser.js.map +1 -0
- package/dist/node_modules/@ncukondo/academic-fulltext/dist/convert/index.js +93 -0
- package/dist/node_modules/@ncukondo/academic-fulltext/dist/convert/index.js.map +1 -0
- package/dist/node_modules/@ncukondo/academic-fulltext/dist/convert/jats-parser.js +1060 -0
- package/dist/node_modules/@ncukondo/academic-fulltext/dist/convert/jats-parser.js.map +1 -0
- package/dist/{fulltext → node_modules/@ncukondo/academic-fulltext/dist}/convert/markdown-writer.js +146 -117
- package/dist/node_modules/@ncukondo/academic-fulltext/dist/convert/markdown-writer.js.map +1 -0
- package/dist/{fulltext → node_modules/@ncukondo/academic-fulltext/dist}/discovery/arxiv.js +8 -1
- package/dist/node_modules/@ncukondo/academic-fulltext/dist/discovery/arxiv.js.map +1 -0
- package/dist/{fulltext → node_modules/@ncukondo/academic-fulltext/dist}/discovery/core.js +6 -3
- package/dist/node_modules/@ncukondo/academic-fulltext/dist/discovery/core.js.map +1 -0
- package/dist/node_modules/@ncukondo/academic-fulltext/dist/discovery/index.js +139 -0
- package/dist/node_modules/@ncukondo/academic-fulltext/dist/discovery/index.js.map +1 -0
- package/dist/node_modules/@ncukondo/academic-fulltext/dist/discovery/ncbi-id-converter.js +46 -0
- package/dist/node_modules/@ncukondo/academic-fulltext/dist/discovery/ncbi-id-converter.js.map +1 -0
- package/dist/{fulltext → node_modules/@ncukondo/academic-fulltext/dist}/discovery/pmc.js +8 -4
- package/dist/node_modules/@ncukondo/academic-fulltext/dist/discovery/pmc.js.map +1 -0
- package/dist/{fulltext → node_modules/@ncukondo/academic-fulltext/dist}/discovery/unpaywall.js +43 -9
- package/dist/node_modules/@ncukondo/academic-fulltext/dist/discovery/unpaywall.js.map +1 -0
- package/dist/node_modules/@ncukondo/academic-fulltext/dist/download/arxiv-html.js +48 -0
- package/dist/node_modules/@ncukondo/academic-fulltext/dist/download/arxiv-html.js.map +1 -0
- package/dist/node_modules/@ncukondo/academic-fulltext/dist/download/downloader.js +64 -0
- package/dist/node_modules/@ncukondo/academic-fulltext/dist/download/downloader.js.map +1 -0
- package/dist/node_modules/@ncukondo/academic-fulltext/dist/download/orchestrator.js +236 -0
- package/dist/node_modules/@ncukondo/academic-fulltext/dist/download/orchestrator.js.map +1 -0
- package/dist/{fulltext → node_modules/@ncukondo/academic-fulltext/dist}/download/pmc-xml.js +2 -1
- package/dist/node_modules/@ncukondo/academic-fulltext/dist/download/pmc-xml.js.map +1 -0
- package/dist/{fulltext → node_modules/@ncukondo/academic-fulltext/dist}/meta.js +15 -10
- package/dist/node_modules/@ncukondo/academic-fulltext/dist/meta.js.map +1 -0
- package/dist/node_modules/@ncukondo/academic-fulltext/dist/paths.js.map +1 -0
- package/dist/{fulltext → node_modules/@ncukondo/academic-fulltext/dist}/readme.js +8 -4
- package/dist/node_modules/@ncukondo/academic-fulltext/dist/readme.js.map +1 -0
- package/dist/node_modules/boolbase/index.js +19 -0
- package/dist/node_modules/boolbase/index.js.map +1 -0
- package/dist/node_modules/css-select/lib/attributes.js +203 -0
- package/dist/node_modules/css-select/lib/attributes.js.map +1 -0
- package/dist/node_modules/css-select/lib/compile.js +141 -0
- package/dist/node_modules/css-select/lib/compile.js.map +1 -0
- package/dist/node_modules/css-select/lib/general.js +154 -0
- package/dist/node_modules/css-select/lib/general.js.map +1 -0
- package/dist/node_modules/css-select/lib/index.js +128 -0
- package/dist/node_modules/css-select/lib/index.js.map +1 -0
- package/dist/node_modules/css-select/lib/pseudo-selectors/aliases.js +40 -0
- package/dist/node_modules/css-select/lib/pseudo-selectors/aliases.js.map +1 -0
- package/dist/node_modules/css-select/lib/pseudo-selectors/filters.js +163 -0
- package/dist/node_modules/css-select/lib/pseudo-selectors/filters.js.map +1 -0
- package/dist/node_modules/css-select/lib/pseudo-selectors/index.js +71 -0
- package/dist/node_modules/css-select/lib/pseudo-selectors/index.js.map +1 -0
- package/dist/node_modules/css-select/lib/pseudo-selectors/pseudos.js +93 -0
- package/dist/node_modules/css-select/lib/pseudo-selectors/pseudos.js.map +1 -0
- package/dist/node_modules/css-select/lib/pseudo-selectors/subselects.js +111 -0
- package/dist/node_modules/css-select/lib/pseudo-selectors/subselects.js.map +1 -0
- package/dist/node_modules/css-select/lib/sort.js +78 -0
- package/dist/node_modules/css-select/lib/sort.js.map +1 -0
- package/dist/node_modules/css-what/lib/es/index.js +12 -0
- package/dist/node_modules/css-what/lib/es/index.js.map +1 -0
- package/dist/node_modules/css-what/lib/es/parse.js +349 -0
- package/dist/node_modules/css-what/lib/es/parse.js.map +1 -0
- package/dist/node_modules/css-what/lib/es/stringify.js +102 -0
- package/dist/node_modules/css-what/lib/es/stringify.js.map +1 -0
- package/dist/node_modules/css-what/lib/es/types.js +37 -0
- package/dist/node_modules/css-what/lib/es/types.js.map +1 -0
- package/dist/node_modules/dom-serializer/lib/foreignNames.js +117 -0
- package/dist/node_modules/dom-serializer/lib/foreignNames.js.map +1 -0
- package/dist/node_modules/dom-serializer/lib/index.js +207 -0
- package/dist/node_modules/dom-serializer/lib/index.js.map +1 -0
- package/dist/node_modules/dom-serializer/node_modules/entities/lib/decode.js +368 -0
- package/dist/node_modules/dom-serializer/node_modules/entities/lib/decode.js.map +1 -0
- package/dist/node_modules/dom-serializer/node_modules/entities/lib/decode_codepoint.js +70 -0
- package/dist/node_modules/dom-serializer/node_modules/entities/lib/decode_codepoint.js.map +1 -0
- package/dist/node_modules/dom-serializer/node_modules/entities/lib/encode.js +61 -0
- package/dist/node_modules/dom-serializer/node_modules/entities/lib/encode.js.map +1 -0
- package/dist/node_modules/dom-serializer/node_modules/entities/lib/escape.js +79 -0
- package/dist/node_modules/dom-serializer/node_modules/entities/lib/escape.js.map +1 -0
- package/dist/node_modules/dom-serializer/node_modules/entities/lib/generated/decode-data-html.js +18 -0
- package/dist/node_modules/dom-serializer/node_modules/entities/lib/generated/decode-data-html.js.map +1 -0
- package/dist/node_modules/dom-serializer/node_modules/entities/lib/generated/decode-data-xml.js +18 -0
- package/dist/node_modules/dom-serializer/node_modules/entities/lib/generated/decode-data-xml.js.map +1 -0
- package/dist/node_modules/dom-serializer/node_modules/entities/lib/generated/encode-html.js +19 -0
- package/dist/node_modules/dom-serializer/node_modules/entities/lib/generated/encode-html.js.map +1 -0
- package/dist/node_modules/dom-serializer/node_modules/entities/lib/index.js +139 -0
- package/dist/node_modules/dom-serializer/node_modules/entities/lib/index.js.map +1 -0
- package/dist/node_modules/domelementtype/lib/index.js +40 -0
- package/dist/node_modules/domelementtype/lib/index.js.map +1 -0
- package/dist/node_modules/domhandler/lib/index.js +167 -0
- package/dist/node_modules/domhandler/lib/index.js.map +1 -0
- package/dist/node_modules/domhandler/lib/node.js +439 -0
- package/dist/node_modules/domhandler/lib/node.js.map +1 -0
- package/dist/node_modules/domutils/lib/feeds.js +146 -0
- package/dist/node_modules/domutils/lib/feeds.js.map +1 -0
- package/dist/node_modules/domutils/lib/helpers.js +97 -0
- package/dist/node_modules/domutils/lib/helpers.js.map +1 -0
- package/dist/node_modules/domutils/lib/index.js +65 -0
- package/dist/node_modules/domutils/lib/index.js.map +1 -0
- package/dist/node_modules/domutils/lib/legacy.js +124 -0
- package/dist/node_modules/domutils/lib/legacy.js.map +1 -0
- package/dist/node_modules/domutils/lib/manipulation.js +107 -0
- package/dist/node_modules/domutils/lib/manipulation.js.map +1 -0
- package/dist/node_modules/domutils/lib/querying.js +102 -0
- package/dist/node_modules/domutils/lib/querying.js.map +1 -0
- package/dist/node_modules/domutils/lib/stringify.js +65 -0
- package/dist/node_modules/domutils/lib/stringify.js.map +1 -0
- package/dist/node_modules/domutils/lib/traversal.js +69 -0
- package/dist/node_modules/domutils/lib/traversal.js.map +1 -0
- package/dist/node_modules/he/he.js +256 -0
- package/dist/node_modules/he/he.js.map +1 -0
- package/dist/node_modules/node-html-parser/dist/back.js +16 -0
- package/dist/node_modules/node-html-parser/dist/back.js.map +1 -0
- package/dist/node_modules/node-html-parser/dist/index.js +48 -0
- package/dist/node_modules/node-html-parser/dist/index.js.map +1 -0
- package/dist/node_modules/node-html-parser/dist/matcher.js +112 -0
- package/dist/node_modules/node-html-parser/dist/matcher.js.map +1 -0
- package/dist/node_modules/node-html-parser/dist/nodes/comment.js +41 -0
- package/dist/node_modules/node-html-parser/dist/nodes/comment.js.map +1 -0
- package/dist/node_modules/node-html-parser/dist/nodes/html.js +1048 -0
- package/dist/node_modules/node-html-parser/dist/nodes/html.js.map +1 -0
- package/dist/node_modules/node-html-parser/dist/nodes/node.js +49 -0
- package/dist/node_modules/node-html-parser/dist/nodes/node.js.map +1 -0
- package/dist/node_modules/node-html-parser/dist/nodes/text.js +106 -0
- package/dist/node_modules/node-html-parser/dist/nodes/text.js.map +1 -0
- package/dist/node_modules/node-html-parser/dist/nodes/type.js +19 -0
- package/dist/node_modules/node-html-parser/dist/nodes/type.js.map +1 -0
- package/dist/node_modules/node-html-parser/dist/parse.js +20 -0
- package/dist/node_modules/node-html-parser/dist/parse.js.map +1 -0
- package/dist/node_modules/node-html-parser/dist/valid.js +19 -0
- package/dist/node_modules/node-html-parser/dist/valid.js.map +1 -0
- package/dist/node_modules/node-html-parser/dist/void-tag.js +36 -0
- package/dist/node_modules/node-html-parser/dist/void-tag.js.map +1 -0
- package/dist/node_modules/nth-check/lib/compile.js +76 -0
- package/dist/node_modules/nth-check/lib/compile.js.map +1 -0
- package/dist/node_modules/nth-check/lib/index.js +36 -0
- package/dist/node_modules/nth-check/lib/index.js.map +1 -0
- package/dist/node_modules/nth-check/lib/parse.js +69 -0
- package/dist/node_modules/nth-check/lib/parse.js.map +1 -0
- package/package.json +2 -2
- package/dist/fulltext/attach-shared.d.ts.map +0 -1
- package/dist/fulltext/attach-shared.js.map +0 -1
- package/dist/fulltext/citation-key.d.ts +0 -15
- package/dist/fulltext/citation-key.d.ts.map +0 -1
- package/dist/fulltext/citation-key.js.map +0 -1
- package/dist/fulltext/convert/index.d.ts +0 -20
- package/dist/fulltext/convert/index.d.ts.map +0 -1
- package/dist/fulltext/convert/index.js +0 -50
- package/dist/fulltext/convert/index.js.map +0 -1
- package/dist/fulltext/convert/jats-parser.d.ts +0 -36
- package/dist/fulltext/convert/jats-parser.d.ts.map +0 -1
- package/dist/fulltext/convert/jats-parser.js +0 -887
- package/dist/fulltext/convert/jats-parser.js.map +0 -1
- package/dist/fulltext/convert/markdown-writer.d.ts +0 -6
- package/dist/fulltext/convert/markdown-writer.d.ts.map +0 -1
- package/dist/fulltext/convert/markdown-writer.js.map +0 -1
- package/dist/fulltext/convert/types.d.ts +0 -141
- package/dist/fulltext/convert/types.d.ts.map +0 -1
- package/dist/fulltext/discovery/arxiv.d.ts +0 -11
- package/dist/fulltext/discovery/arxiv.d.ts.map +0 -1
- package/dist/fulltext/discovery/arxiv.js.map +0 -1
- package/dist/fulltext/discovery/core.d.ts +0 -11
- package/dist/fulltext/discovery/core.d.ts.map +0 -1
- package/dist/fulltext/discovery/core.js.map +0 -1
- package/dist/fulltext/discovery/index.d.ts +0 -28
- package/dist/fulltext/discovery/index.d.ts.map +0 -1
- package/dist/fulltext/discovery/index.js +0 -75
- package/dist/fulltext/discovery/index.js.map +0 -1
- package/dist/fulltext/discovery/pmc.d.ts +0 -19
- package/dist/fulltext/discovery/pmc.d.ts.map +0 -1
- package/dist/fulltext/discovery/pmc.js.map +0 -1
- package/dist/fulltext/discovery/unpaywall.d.ts +0 -11
- package/dist/fulltext/discovery/unpaywall.d.ts.map +0 -1
- package/dist/fulltext/discovery/unpaywall.js.map +0 -1
- package/dist/fulltext/download/downloader.d.ts +0 -21
- package/dist/fulltext/download/downloader.d.ts.map +0 -1
- package/dist/fulltext/download/downloader.js +0 -59
- package/dist/fulltext/download/downloader.js.map +0 -1
- package/dist/fulltext/download/orchestrator.d.ts +0 -33
- package/dist/fulltext/download/orchestrator.d.ts.map +0 -1
- package/dist/fulltext/download/orchestrator.js +0 -125
- package/dist/fulltext/download/orchestrator.js.map +0 -1
- package/dist/fulltext/download/pmc-xml.d.ts +0 -13
- package/dist/fulltext/download/pmc-xml.d.ts.map +0 -1
- package/dist/fulltext/download/pmc-xml.js.map +0 -1
- package/dist/fulltext/meta.d.ts +0 -25
- package/dist/fulltext/meta.d.ts.map +0 -1
- package/dist/fulltext/meta.js.map +0 -1
- package/dist/fulltext/paths.d.ts +0 -12
- package/dist/fulltext/paths.d.ts.map +0 -1
- package/dist/fulltext/paths.js.map +0 -1
- package/dist/fulltext/readme.d.ts +0 -4
- package/dist/fulltext/readme.d.ts.map +0 -1
- package/dist/fulltext/readme.js.map +0 -1
- package/dist/fulltext/types.d.ts +0 -90
- package/dist/fulltext/types.d.ts.map +0 -1
- /package/dist/{fulltext → integration}/attach-shared.js +0 -0
- /package/dist/{fulltext → node_modules/@ncukondo/academic-fulltext/dist}/paths.js +0 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"arxiv-html-parser.js","sources":["../../../../../../node_modules/@ncukondo/academic-fulltext/dist/convert/arxiv-html-parser.js"],"sourcesContent":["/**\n * arXiv HTML to JatsDocument parser.\n *\n * Parses LaTeXML-generated HTML from arXiv and converts it to the intermediate\n * JatsDocument representation, which can then be rendered to Markdown via writeMarkdown().\n */\nimport { parse as parseHtml } from \"node-html-parser\";\n// ---------------------------------------------------------------------------\n// Helpers\n// ---------------------------------------------------------------------------\n/** Query a single element, returning null if not found. */\nfunction q(root, selector) {\n return root.querySelector(selector);\n}\n/** Query all matching elements. */\nfunction qa(root, selector) {\n return root.querySelectorAll(selector);\n}\n/** Check if an element has a given CSS class. */\nfunction hasClass(el, cls) {\n return el.classList.contains(cls);\n}\nfunction escapeRegex(str) {\n return str.replace(/[.*+?^${}()|[\\]\\\\]/g, \"\\\\$&\");\n}\n/** Clone an element and remove <annotation> elements, then return textContent. */\nfunction stripAnnotations(el) {\n const clone = parseHtml(el.outerHTML);\n for (const ann of qa(clone, \"annotation\")) {\n ann.remove();\n }\n return clone.textContent.trim();\n}\n// ---------------------------------------------------------------------------\n// Inline content parsing — class-based handlers\n// ---------------------------------------------------------------------------\n/** Try matching a class-based inline element. Returns null if no match. 
*/\nfunction matchClassInline(el) {\n if (hasClass(el, \"ltx_font_bold\")) {\n return { type: \"bold\", children: parseInlineChildren(el) };\n }\n if (hasClass(el, \"ltx_font_italic\")) {\n return { type: \"italic\", children: parseInlineChildren(el) };\n }\n if (hasClass(el, \"ltx_font_typewriter\")) {\n return { type: \"code\", text: el.textContent.trim() };\n }\n return null;\n}\nfunction handleMath(el) {\n const alt = el.getAttribute(\"alttext\");\n const text = stripAnnotations(el);\n const result = { type: \"inline-formula\", text };\n if (alt)\n result.tex = alt;\n return result;\n}\nfunction handleAnchor(el) {\n const href = el.getAttribute(\"href\") ?? \"\";\n if (href.startsWith(\"#bib\")) {\n return { type: \"citation\", refId: href.slice(1), text: el.textContent.trim() };\n }\n if (href.startsWith(\"http://\") || href.startsWith(\"https://\")) {\n return { type: \"link\", url: href, children: parseInlineChildren(el) };\n }\n return { type: \"text\", text: el.textContent.trim() };\n}\n/** Dispatch map for tag names → inline handlers. */\nconst TAG_INLINE_HANDLERS = {\n math: handleMath,\n a: handleAnchor,\n b: (el) => ({ type: \"bold\", children: parseInlineChildren(el) }),\n strong: (el) => ({ type: \"bold\", children: parseInlineChildren(el) }),\n i: (el) => ({ type: \"italic\", children: parseInlineChildren(el) }),\n em: (el) => ({ type: \"italic\", children: parseInlineChildren(el) }),\n code: (el) => ({ type: \"code\", text: el.textContent.trim() }),\n sup: (el) => ({ type: \"superscript\", text: el.textContent.trim() }),\n sub: (el) => ({ type: \"subscript\", text: el.textContent.trim() }),\n};\n/** Tags that should be recursed into as inline containers. */\nconst INLINE_CONTAINER_TAGS = new Set([\"span\", \"cite\"]);\n/** Process a single element node into inline content. 
*/\nfunction processInlineElement(el) {\n // Class-based detection first\n const classMatch = matchClassInline(el);\n if (classMatch)\n return [classMatch];\n // Tag-based dispatch\n const tag = el.tagName?.toLowerCase() ?? \"\";\n const handler = TAG_INLINE_HANDLERS[tag];\n if (handler) {\n const content = handler(el);\n return content ? [content] : [];\n }\n // Inline containers: recurse\n if (INLINE_CONTAINER_TAGS.has(tag)) {\n return parseInlineChildren(el);\n }\n // Fallback: extract text\n const text = el.textContent.trim();\n return text ? [{ type: \"text\", text }] : [];\n}\n/** Parse inline content from child nodes of an element. */\nfunction parseInlineChildren(parent) {\n const result = [];\n for (const node of parent.childNodes) {\n if (node.nodeType === 3) {\n const text = node.textContent;\n if (text)\n result.push({ type: \"text\", text });\n }\n else if (node.nodeType === 1) {\n result.push(...processInlineElement(node));\n }\n }\n return result;\n}\n// ---------------------------------------------------------------------------\n// Block content parsing — individual block handlers\n// ---------------------------------------------------------------------------\nfunction parseParagraph(el) {\n const innerP = q(el, \"p\");\n return { type: \"paragraph\", content: parseInlineChildren(innerP ?? el) };\n}\n/**\n * Parse an ltx_para element which may contain a mix of <p> text and\n * block-level elements (equations, tables, etc.).\n */\nfunction parseLtxPara(el) {\n const blocks = [];\n for (const child of el.childNodes) {\n if (child.nodeType !== 1)\n continue;\n const childEl = child;\n const tag = childEl.tagName?.toLowerCase() ?? \"\";\n if (tag === \"section\" || /^h[1-6]$/.test(tag))\n continue;\n const block = matchBlockByClass(childEl) ?? 
matchBlockByTag(childEl, tag);\n if (block) {\n blocks.push(block);\n }\n }\n if (blocks.length === 0) {\n blocks.push(parseParagraph(el));\n }\n return blocks;\n}\nfunction parseList(el) {\n const tag = el.tagName?.toLowerCase() ?? \"\";\n const ordered = tag === \"ol\" || hasClass(el, \"ltx_enumerate\");\n const items = [];\n for (const li of qa(el, \"li\")) {\n items.push(parseInlineChildren(li));\n }\n return { type: \"list\", ordered, items };\n}\nfunction parseFigure(el) {\n const labelEl = q(el, \".ltx_caption .ltx_tag_figure\");\n const captionEl = q(el, \".ltx_caption\");\n const id = el.getAttribute(\"id\");\n let label;\n if (labelEl) {\n label = labelEl.textContent.trim().replace(/:$/, \"\");\n }\n let caption;\n if (captionEl) {\n const captionText = captionEl.textContent.trim();\n caption = label\n ? captionText.replace(new RegExp(`^${escapeRegex(label)}[:\\\\s]*`), \"\").trim()\n : captionText;\n }\n const block = { type: \"figure\" };\n if (id)\n block.id = id;\n if (label)\n block.label = label;\n if (caption)\n block.caption = caption;\n return block;\n}\nfunction parseTableHeaders(thead) {\n const headers = [];\n const headerRow = q(thead, \"tr\") ?? q(thead, \".ltx_tr\");\n if (headerRow) {\n for (const th of qa(headerRow, \"th, td, .ltx_td\")) {\n headers.push(stripAnnotations(th));\n }\n }\n return headers;\n}\nfunction parseTableRows(el, thead) {\n const rows = [];\n const tbody = q(el, \"tbody\") ?? q(el, \".ltx_tbody\") ?? el;\n for (const tr of qa(tbody, \"tr, .ltx_tr\")) {\n if (thead && tr.parentNode === thead)\n continue;\n const cells = [];\n for (const td of qa(tr, \"td, th, .ltx_td\")) {\n cells.push(stripAnnotations(td));\n }\n if (cells.length > 0)\n rows.push(cells);\n }\n return rows;\n}\nfunction parseTable(el) {\n const captionEl = q(el, \".ltx_caption\");\n const thead = q(el, \"thead\") ?? q(el, \".ltx_thead\");\n const headers = thead ? 
parseTableHeaders(thead) : [];\n const rows = parseTableRows(el, thead);\n if (captionEl) {\n return { type: \"table\", caption: captionEl.textContent.trim(), headers, rows };\n }\n return { type: \"table\", headers, rows };\n}\nfunction parseFormula(el) {\n const mathEl = q(el, \"math\");\n const id = el.getAttribute(\"id\");\n const labelEl = q(el, \".ltx_tag_equation\");\n const label = labelEl ? labelEl.textContent.trim() : undefined;\n let tex;\n let text;\n if (mathEl) {\n const alt = mathEl.getAttribute(\"alttext\");\n if (alt)\n tex = alt;\n text = stripAnnotations(mathEl);\n }\n else {\n text = el.textContent.trim();\n }\n const block = { type: \"formula\" };\n if (id)\n block.id = id;\n if (label)\n block.label = label;\n if (tex)\n block.tex = tex;\n if (text && !tex)\n block.text = text;\n return block;\n}\n// ---------------------------------------------------------------------------\n// Block content parsing — dispatch\n// ---------------------------------------------------------------------------\n/** Match block elements by CSS class (checked before tag-based matching). */\nfunction matchBlockByClass(el) {\n if (hasClass(el, \"ltx_equation\") || hasClass(el, \"ltx_eqn_table\"))\n return parseFormula(el);\n if (hasClass(el, \"ltx_tabular\"))\n return parseTable(el);\n return null;\n}\n/** Match block elements by tag name. */\nfunction matchBlockByTag(el, tag) {\n if (tag === \"p\")\n return { type: \"paragraph\", content: parseInlineChildren(el) };\n if (tag === \"ol\" || tag === \"ul\")\n return parseList(el);\n if (tag === \"figure\" && hasClass(el, \"ltx_table\"))\n return parseTable(el);\n if (tag === \"table\")\n return parseTable(el);\n if (tag === \"figure\")\n return parseFigure(el);\n if (tag === \"blockquote\")\n return { type: \"blockquote\", content: parseInlineChildren(el) };\n if (tag === \"pre\")\n return { type: \"preformat\", text: el.textContent.trim() };\n return null;\n}\n/** Try to parse an element as a specific block type. 
Returns null if not matched. */\nfunction matchBlock(el) {\n const tag = el.tagName?.toLowerCase() ?? \"\";\n if (tag === \"section\" || /^h[1-6]$/.test(tag))\n return null;\n return matchBlockByClass(el) ?? matchBlockByTag(el, tag);\n}\n/** Parse block content from a section's child elements. */\nfunction parseBlockContent(parent) {\n const blocks = [];\n for (const child of parent.childNodes) {\n if (child.nodeType !== 1)\n continue;\n const el = child;\n // ltx_para may contain mixed content (text + embedded equations/tables)\n if (hasClass(el, \"ltx_para\")) {\n blocks.push(...parseLtxPara(el));\n continue;\n }\n const block = matchBlock(el);\n if (block) {\n blocks.push(block);\n continue;\n }\n // Fallback: treat as paragraph if it has text and is not a skipped element\n const tag = el.tagName?.toLowerCase() ?? \"\";\n if (tag !== \"section\" && tag !== \"nav\" && !/^h[1-6]$/.test(tag)) {\n const text = el.textContent.trim();\n if (text)\n blocks.push({ type: \"paragraph\", content: parseInlineChildren(el) });\n }\n }\n return blocks;\n}\n// ---------------------------------------------------------------------------\n// Section parsing\n// ---------------------------------------------------------------------------\n/** Determine section level from CSS class. */\nfunction getSectionLevel(el) {\n if (hasClass(el, \"ltx_subsection\"))\n return 3;\n if (hasClass(el, \"ltx_subsubsection\"))\n return 4;\n if (hasClass(el, \"ltx_paragraph\"))\n return 5;\n return 2;\n}\n/** Find the heading element for a section. */\nfunction findHeading(el, level) {\n return (q(el, `:scope > h${level}`) ??\n q(el, \":scope > h2\") ??\n q(el, \":scope > h3\") ??\n q(el, \":scope > h4\") ??\n q(el, \":scope > h5\") ??\n q(el, \":scope > h6\"));\n}\n/** Parse a section element and its subsections recursively. */\nfunction parseSection(el) {\n const level = getSectionLevel(el);\n const headingEl = findHeading(el, level);\n const title = headingEl ? 
headingEl.textContent.trim() : \"\";\n const content = parseBlockContent(el);\n const subsections = [];\n for (const child of el.childNodes) {\n if (child.nodeType !== 1)\n continue;\n const childEl = child;\n if (childEl.tagName?.toLowerCase() === \"section\") {\n subsections.push(parseSection(childEl));\n }\n }\n return { title, level, content, subsections };\n}\n// ---------------------------------------------------------------------------\n// Metadata parsing\n// ---------------------------------------------------------------------------\n/** Extract document title from the HTML. */\nfunction parseTitle(root) {\n const titleEl = q(root, \".ltx_title.ltx_title_document\");\n if (!titleEl)\n return \"\";\n const titleClone = parseHtml(titleEl.outerHTML);\n for (const authorInTitle of qa(titleClone, \".ltx_authors\")) {\n authorInTitle.remove();\n }\n return titleClone.textContent.trim();\n}\n/** Try to parse a comma-split part as an author. Returns null for affiliations. */\nfunction parseAuthorName(commaPart) {\n // Take the first non-empty line (affiliations appear on subsequent lines after <br>)\n const lines = commaPart.split(/\\n/);\n const firstLine = lines.find((l) => l.trim() !== \"\")?.trim() ?? \"\";\n if (!firstLine)\n return null;\n // Skip affiliation entries: start with digit (e.g. \"1IBM Research\", \"2 NASA MFSC\")\n if (/^\\d/.test(firstLine))\n return null;\n // Remove trailing superscript digits (affiliation markers attached to names)\n const name = firstLine.replace(/\\d+$/, \"\").trim();\n if (!name)\n return null;\n const words = name.split(/\\s+/);\n const surname = words.pop() ?? name;\n const author = { surname };\n if (words.length > 0)\n author.givenNames = words.join(\" \");\n return author;\n}\n/** Extract authors from the HTML. 
*/\nfunction parseAuthors(root) {\n const authors = [];\n for (const authorEl of qa(root, \".ltx_authors .ltx_personname\")) {\n const fullText = authorEl.textContent.trim();\n if (!fullText)\n continue;\n // Split by comma to handle multiple authors in a single ltx_personname\n for (const part of fullText.split(\",\")) {\n const author = parseAuthorName(part);\n if (author)\n authors.push(author);\n }\n }\n return authors;\n}\n/** Extract abstract text from the HTML. */\nfunction parseAbstract(root) {\n const abstractEl = q(root, \".ltx_abstract\");\n if (!abstractEl)\n return undefined;\n const clone = parseHtml(abstractEl.outerHTML);\n const absTitle = q(clone, \".ltx_title\");\n if (absTitle)\n absTitle.remove();\n for (const note of qa(clone, \".ltx_note\")) {\n note.remove();\n }\n const text = clone.textContent.trim();\n return text || undefined;\n}\n/** Parse document metadata from the HTML. */\nfunction parseMetadata(root) {\n const title = parseTitle(root);\n const authors = parseAuthors(root);\n const abstract = parseAbstract(root);\n const keywordEls = qa(root, \".ltx_keywords .ltx_text\");\n const keywords = keywordEls.map((kw) => kw.textContent.trim()).filter(Boolean);\n const metadata = { title, authors };\n if (abstract)\n metadata.abstract = abstract;\n if (keywords.length > 0)\n metadata.keywords = keywords;\n return metadata;\n}\n// ---------------------------------------------------------------------------\n// References parsing\n// ---------------------------------------------------------------------------\n/** Extract reference text from a bibitem element. 
*/\nfunction extractRefText(item) {\n const bibBlock = q(item, \".ltx_bibblock\");\n if (bibBlock)\n return bibBlock.textContent.trim();\n let text = item.textContent.trim();\n const labelEl = q(item, \".ltx_tag_bibitem\");\n if (labelEl) {\n const labelText = labelEl.textContent.trim();\n if (text.startsWith(labelText)) {\n text = text.slice(labelText.length).trim();\n }\n }\n return text;\n}\n/** Extract DOI from links in a bibitem. */\nfunction extractDoi(item) {\n for (const link of qa(item, \"a\")) {\n const href = link.getAttribute(\"href\") ?? \"\";\n const doiMatch = href.match(/doi\\.org\\/(.+)/);\n if (doiMatch)\n return doiMatch[1];\n }\n return undefined;\n}\n/** Parse bibliography references. */\nfunction parseReferences(root) {\n const refs = [];\n for (const item of qa(root, \".ltx_bibitem\")) {\n const id = item.getAttribute(\"id\") ?? `ref${refs.length + 1}`;\n const text = extractRefText(item);\n const doi = extractDoi(item);\n const ref = { id, text };\n if (doi)\n ref.doi = doi;\n refs.push(ref);\n }\n return refs;\n}\n// ---------------------------------------------------------------------------\n// Acknowledgments parsing\n// ---------------------------------------------------------------------------\nfunction parseAcknowledgments(root) {\n const ackEl = q(root, \".ltx_acknowledgement\");\n if (!ackEl)\n return undefined;\n const clone = parseHtml(ackEl.outerHTML);\n const heading = q(clone, \".ltx_title\");\n if (heading)\n heading.remove();\n const text = clone.textContent.trim();\n return text || undefined;\n}\n// ---------------------------------------------------------------------------\n// Body sections parsing (shared logic)\n// ---------------------------------------------------------------------------\n/** Check if a section element is a body section (not bibliography or acknowledgements). 
*/\nfunction isBodySection(el) {\n return (el.tagName?.toLowerCase() === \"section\" &&\n !hasClass(el, \"ltx_bibliography\") &&\n !hasClass(el, \"ltx_acknowledgement\"));\n}\n/** Parse body sections from an article root element. */\nfunction parseBodySections(article) {\n const sections = [];\n for (const child of article.childNodes) {\n if (child.nodeType !== 1)\n continue;\n const el = child;\n if (isBodySection(el)) {\n sections.push(parseSection(el));\n }\n }\n return sections;\n}\n// ---------------------------------------------------------------------------\n// Public API\n// ---------------------------------------------------------------------------\n/**\n * Parse arXiv LaTeXML HTML into a JatsDocument.\n * @param html - Full HTML string from arXiv\n * @returns Complete parsed document\n */\nexport function parseArxivHtml(html) {\n const root = parseHtml(html);\n const metadata = parseMetadata(root);\n const references = parseReferences(root);\n const article = q(root, \"article.ltx_document\") ?? root;\n const sections = parseBodySections(article);\n const acknowledgments = parseAcknowledgments(root);\n const doc = { metadata, sections, references };\n if (acknowledgments)\n doc.acknowledgments = acknowledgments;\n return doc;\n}\n/**\n * Parse only metadata from arXiv HTML.\n */\nexport function parseArxivHtmlMetadata(html) {\n return parseMetadata(parseHtml(html));\n}\n/**\n * Parse only body sections from arXiv HTML.\n */\nexport function parseArxivHtmlBody(html) {\n const root = parseHtml(html);\n const article = q(root, \"article.ltx_document\") ?? 
root;\n return parseBodySections(article);\n}\n/**\n * Parse only references from arXiv HTML.\n */\nexport function parseArxivHtmlReferences(html) {\n return parseReferences(parseHtml(html));\n}\n//# sourceMappingURL=arxiv-html-parser.js.map"],"names":["parseHtml"],"mappings":";AAWA,SAAS,EAAE,MAAM,UAAU;AACvB,SAAO,KAAK,cAAc,QAAQ;AACtC;AAEA,SAAS,GAAG,MAAM,UAAU;AACxB,SAAO,KAAK,iBAAiB,QAAQ;AACzC;AAEA,SAAS,SAAS,IAAI,KAAK;AACvB,SAAO,GAAG,UAAU,SAAS,GAAG;AACpC;AACA,SAAS,YAAY,KAAK;AACtB,SAAO,IAAI,QAAQ,uBAAuB,MAAM;AACpD;AAEA,SAAS,iBAAiB,IAAI;AAC1B,QAAM,QAAQA,YAAAA,MAAU,GAAG,SAAS;AACpC,aAAW,OAAO,GAAG,OAAO,YAAY,GAAG;AACvC,QAAI,OAAM;AAAA,EACd;AACA,SAAO,MAAM,YAAY,KAAI;AACjC;AAKA,SAAS,iBAAiB,IAAI;AAC1B,MAAI,SAAS,IAAI,eAAe,GAAG;AAC/B,WAAO,EAAE,MAAM,QAAQ,UAAU,oBAAoB,EAAE,EAAC;AAAA,EAC5D;AACA,MAAI,SAAS,IAAI,iBAAiB,GAAG;AACjC,WAAO,EAAE,MAAM,UAAU,UAAU,oBAAoB,EAAE,EAAC;AAAA,EAC9D;AACA,MAAI,SAAS,IAAI,qBAAqB,GAAG;AACrC,WAAO,EAAE,MAAM,QAAQ,MAAM,GAAG,YAAY,OAAM;AAAA,EACtD;AACA,SAAO;AACX;AACA,SAAS,WAAW,IAAI;AACpB,QAAM,MAAM,GAAG,aAAa,SAAS;AACrC,QAAM,OAAO,iBAAiB,EAAE;AAChC,QAAM,SAAS,EAAE,MAAM,kBAAkB,KAAI;AAC7C,MAAI;AACA,WAAO,MAAM;AACjB,SAAO;AACX;AACA,SAAS,aAAa,IAAI;AACtB,QAAM,OAAO,GAAG,aAAa,MAAM,KAAK;AACxC,MAAI,KAAK,WAAW,MAAM,GAAG;AACzB,WAAO,EAAE,MAAM,YAAY,OAAO,KAAK,MAAM,CAAC,GAAG,MAAM,GAAG,YAAY,KAAI,EAAE;AAAA,EAChF;AACA,MAAI,KAAK,WAAW,SAAS,KAAK,KAAK,WAAW,UAAU,GAAG;AAC3D,WAAO,EAAE,MAAM,QAAQ,KAAK,MAAM,UAAU,oBAAoB,EAAE,EAAC;AAAA,EACvE;AACA,SAAO,EAAE,MAAM,QAAQ,MAAM,GAAG,YAAY,OAAM;AACtD;AAEA,MAAM,sBAAsB;AAAA,EACxB,MAAM;AAAA,EACN,GAAG;AAAA,EACH,GAAG,CAAC,QAAQ,EAAE,MAAM,QAAQ,UAAU,oBAAoB,EAAE;EAC5D,QAAQ,CAAC,QAAQ,EAAE,MAAM,QAAQ,UAAU,oBAAoB,EAAE;EACjE,GAAG,CAAC,QAAQ,EAAE,MAAM,UAAU,UAAU,oBAAoB,EAAE;EAC9D,IAAI,CAAC,QAAQ,EAAE,MAAM,UAAU,UAAU,oBAAoB,EAAE;EAC/D,MAAM,CAAC,QAAQ,EAAE,MAAM,QAAQ,MAAM,GAAG,YAAY,KAAI;EACxD,KAAK,CAAC,QAAQ,EAAE,MAAM,eAAe,MAAM,GAAG,YAAY,KAAI;EAC9D,KAAK,CAAC,QAAQ,EAAE,MAAM,aAAa,MAAM,GAAG,YAAY,KAAI;AAChE;AAEA,MAAM,wBAAwB,oBAAI,IAAI,CAAC,QAAQ,MAAM,CAAC;AAEtD,SAAS,qBAAqB,IAAI;AAE9B,QAAM,a
AAa,iBAAiB,EAAE;AACtC,MAAI;AACA,WAAO,CAAC,UAAU;AAEtB,QAAM,MAAM,GAAG,SAAS,YAAW,KAAM;AACzC,QAAM,UAAU,oBAAoB,GAAG;AACvC,MAAI,SAAS;AACT,UAAM,UAAU,QAAQ,EAAE;AAC1B,WAAO,UAAU,CAAC,OAAO,IAAI,CAAA;AAAA,EACjC;AAEA,MAAI,sBAAsB,IAAI,GAAG,GAAG;AAChC,WAAO,oBAAoB,EAAE;AAAA,EACjC;AAEA,QAAM,OAAO,GAAG,YAAY,KAAI;AAChC,SAAO,OAAO,CAAC,EAAE,MAAM,QAAQ,KAAI,CAAE,IAAI,CAAA;AAC7C;AAEA,SAAS,oBAAoB,QAAQ;AACjC,QAAM,SAAS,CAAA;AACf,aAAW,QAAQ,OAAO,YAAY;AAClC,QAAI,KAAK,aAAa,GAAG;AACrB,YAAM,OAAO,KAAK;AAClB,UAAI;AACA,eAAO,KAAK,EAAE,MAAM,QAAQ,KAAI,CAAE;AAAA,IAC1C,WACS,KAAK,aAAa,GAAG;AAC1B,aAAO,KAAK,GAAG,qBAAqB,IAAI,CAAC;AAAA,IAC7C;AAAA,EACJ;AACA,SAAO;AACX;AAIA,SAAS,eAAe,IAAI;AACxB,QAAM,SAAS,EAAE,IAAI,GAAG;AACxB,SAAO,EAAE,MAAM,aAAa,SAAS,oBAAoB,UAAU,EAAE,EAAC;AAC1E;AAKA,SAAS,aAAa,IAAI;AACtB,QAAM,SAAS,CAAA;AACf,aAAW,SAAS,GAAG,YAAY;AAC/B,QAAI,MAAM,aAAa;AACnB;AACJ,UAAM,UAAU;AAChB,UAAM,MAAM,QAAQ,SAAS,YAAW,KAAM;AAC9C,QAAI,QAAQ,aAAa,WAAW,KAAK,GAAG;AACxC;AACJ,UAAM,QAAQ,kBAAkB,OAAO,KAAK,gBAAgB,SAAS,GAAG;AACxE,QAAI,OAAO;AACP,aAAO,KAAK,KAAK;AAAA,IACrB;AAAA,EACJ;AACA,MAAI,OAAO,WAAW,GAAG;AACrB,WAAO,KAAK,eAAe,EAAE,CAAC;AAAA,EAClC;AACA,SAAO;AACX;AACA,SAAS,UAAU,IAAI;AACnB,QAAM,MAAM,GAAG,SAAS,YAAW,KAAM;AACzC,QAAM,UAAU,QAAQ,QAAQ,SAAS,IAAI,eAAe;AAC5D,QAAM,QAAQ,CAAA;AACd,aAAW,MAAM,GAAG,IAAI,IAAI,GAAG;AAC3B,UAAM,KAAK,oBAAoB,EAAE,CAAC;AAAA,EACtC;AACA,SAAO,EAAE,MAAM,QAAQ,SAAS,MAAK;AACzC;AACA,SAAS,YAAY,IAAI;AACrB,QAAM,UAAU,EAAE,IAAI,8BAA8B;AACpD,QAAM,YAAY,EAAE,IAAI,cAAc;AACtC,QAAM,KAAK,GAAG,aAAa,IAAI;AAC/B,MAAI;AACJ,MAAI,SAAS;AACT,YAAQ,QAAQ,YAAY,KAAI,EAAG,QAAQ,MAAM,EAAE;AAAA,EACvD;AACA,MAAI;AACJ,MAAI,WAAW;AACX,UAAM,cAAc,UAAU,YAAY,KAAI;AAC9C,cAAU,QACJ,YAAY,QAAQ,IAAI,OAAO,IAAI,YAAY,KAAK,CAAC,SAAS,GAAG,EAAE,EAAE,KAAI,IACzE;AAAA,EACV;AACA,QAAM,QAAQ,EAAE,MAAM,SAAQ;AAC9B,MAAI;AACA,UAAM,KAAK;AACf,MAAI;AACA,UAAM,QAAQ;AAClB,MAAI;AACA,UAAM,UAAU;AACpB,SAAO;AACX;AACA,SAAS,kBAAkB,OAAO;AAC9B,QAAM,UAAU,CAAA;AAChB,QAAM,YAAY,EAAE,OAAO,IAAI,KAAK,EAAE,OAAO,SAAS;AACtD,MAAI,WAAW;AACX,eAAW,MAAM,GAAG,WAAW,iBAAiB,GAAG;AAC/C,cAAQ,KAAK,iBAAiB,EAAE,CAA
C;AAAA,IACrC;AAAA,EACJ;AACA,SAAO;AACX;AACA,SAAS,eAAe,IAAI,OAAO;AAC/B,QAAM,OAAO,CAAA;AACb,QAAM,QAAQ,EAAE,IAAI,OAAO,KAAK,EAAE,IAAI,YAAY,KAAK;AACvD,aAAW,MAAM,GAAG,OAAO,aAAa,GAAG;AACvC,QAAI,SAAS,GAAG,eAAe;AAC3B;AACJ,UAAM,QAAQ,CAAA;AACd,eAAW,MAAM,GAAG,IAAI,iBAAiB,GAAG;AACxC,YAAM,KAAK,iBAAiB,EAAE,CAAC;AAAA,IACnC;AACA,QAAI,MAAM,SAAS;AACf,WAAK,KAAK,KAAK;AAAA,EACvB;AACA,SAAO;AACX;AACA,SAAS,WAAW,IAAI;AACpB,QAAM,YAAY,EAAE,IAAI,cAAc;AACtC,QAAM,QAAQ,EAAE,IAAI,OAAO,KAAK,EAAE,IAAI,YAAY;AAClD,QAAM,UAAU,QAAQ,kBAAkB,KAAK,IAAI,CAAA;AACnD,QAAM,OAAO,eAAe,IAAI,KAAK;AACrC,MAAI,WAAW;AACX,WAAO,EAAE,MAAM,SAAS,SAAS,UAAU,YAAY,KAAI,GAAI,SAAS,KAAI;AAAA,EAChF;AACA,SAAO,EAAE,MAAM,SAAS,SAAS,KAAI;AACzC;AACA,SAAS,aAAa,IAAI;AACtB,QAAM,SAAS,EAAE,IAAI,MAAM;AAC3B,QAAM,KAAK,GAAG,aAAa,IAAI;AAC/B,QAAM,UAAU,EAAE,IAAI,mBAAmB;AACzC,QAAM,QAAQ,UAAU,QAAQ,YAAY,KAAI,IAAK;AACrD,MAAI;AACJ,MAAI;AACJ,MAAI,QAAQ;AACR,UAAM,MAAM,OAAO,aAAa,SAAS;AACzC,QAAI;AACA,YAAM;AACV,WAAO,iBAAiB,MAAM;AAAA,EAClC,OACK;AACD,WAAO,GAAG,YAAY,KAAI;AAAA,EAC9B;AACA,QAAM,QAAQ,EAAE,MAAM,UAAS;AAC/B,MAAI;AACA,UAAM,KAAK;AACf,MAAI;AACA,UAAM,QAAQ;AAClB,MAAI;AACA,UAAM,MAAM;AAChB,MAAI,QAAQ,CAAC;AACT,UAAM,OAAO;AACjB,SAAO;AACX;AAKA,SAAS,kBAAkB,IAAI;AAC3B,MAAI,SAAS,IAAI,cAAc,KAAK,SAAS,IAAI,eAAe;AAC5D,WAAO,aAAa,EAAE;AAC1B,MAAI,SAAS,IAAI,aAAa;AAC1B,WAAO,WAAW,EAAE;AACxB,SAAO;AACX;AAEA,SAAS,gBAAgB,IAAI,KAAK;AAC9B,MAAI,QAAQ;AACR,WAAO,EAAE,MAAM,aAAa,SAAS,oBAAoB,EAAE,EAAC;AAChE,MAAI,QAAQ,QAAQ,QAAQ;AACxB,WAAO,UAAU,EAAE;AACvB,MAAI,QAAQ,YAAY,SAAS,IAAI,WAAW;AAC5C,WAAO,WAAW,EAAE;AACxB,MAAI,QAAQ;AACR,WAAO,WAAW,EAAE;AACxB,MAAI,QAAQ;AACR,WAAO,YAAY,EAAE;AACzB,MAAI,QAAQ;AACR,WAAO,EAAE,MAAM,cAAc,SAAS,oBAAoB,EAAE,EAAC;AACjE,MAAI,QAAQ;AACR,WAAO,EAAE,MAAM,aAAa,MAAM,GAAG,YAAY,OAAM;AAC3D,SAAO;AACX;AAEA,SAAS,WAAW,IAAI;AACpB,QAAM,MAAM,GAAG,SAAS,YAAW,KAAM;AACzC,MAAI,QAAQ,aAAa,WAAW,KAAK,GAAG;AACxC,WAAO;AACX,SAAO,kBAAkB,EAAE,KAAK,gBAAgB,IAAI,GAAG;AAC3D;AAEA,SAAS,kBAAkB,QAAQ;AAC/B,QAAM,SAAS,CAAA;AACf,aAAW,SAAS,OAAO,YAAY;AACnC,QAAI,MAAM,aAAa;AACnB;AACJ,UAAM,KAAK;AAEX,QAAI,SAAS,IAAI,UAAU,
GAAG;AAC1B,aAAO,KAAK,GAAG,aAAa,EAAE,CAAC;AAC/B;AAAA,IACJ;AACA,UAAM,QAAQ,WAAW,EAAE;AAC3B,QAAI,OAAO;AACP,aAAO,KAAK,KAAK;AACjB;AAAA,IACJ;AAEA,UAAM,MAAM,GAAG,SAAS,YAAW,KAAM;AACzC,QAAI,QAAQ,aAAa,QAAQ,SAAS,CAAC,WAAW,KAAK,GAAG,GAAG;AAC7D,YAAM,OAAO,GAAG,YAAY,KAAI;AAChC,UAAI;AACA,eAAO,KAAK,EAAE,MAAM,aAAa,SAAS,oBAAoB,EAAE,GAAG;AAAA,IAC3E;AAAA,EACJ;AACA,SAAO;AACX;AAKA,SAAS,gBAAgB,IAAI;AACzB,MAAI,SAAS,IAAI,gBAAgB;AAC7B,WAAO;AACX,MAAI,SAAS,IAAI,mBAAmB;AAChC,WAAO;AACX,MAAI,SAAS,IAAI,eAAe;AAC5B,WAAO;AACX,SAAO;AACX;AAEA,SAAS,YAAY,IAAI,OAAO;AAC5B,SAAQ,EAAE,IAAI,aAAa,KAAK,EAAE,KAC9B,EAAE,IAAI,aAAa,KACnB,EAAE,IAAI,aAAa,KACnB,EAAE,IAAI,aAAa,KACnB,EAAE,IAAI,aAAa,KACnB,EAAE,IAAI,aAAa;AAC3B;AAEA,SAAS,aAAa,IAAI;AACtB,QAAM,QAAQ,gBAAgB,EAAE;AAChC,QAAM,YAAY,YAAY,IAAI,KAAK;AACvC,QAAM,QAAQ,YAAY,UAAU,YAAY,KAAI,IAAK;AACzD,QAAM,UAAU,kBAAkB,EAAE;AACpC,QAAM,cAAc,CAAA;AACpB,aAAW,SAAS,GAAG,YAAY;AAC/B,QAAI,MAAM,aAAa;AACnB;AACJ,UAAM,UAAU;AAChB,QAAI,QAAQ,SAAS,YAAW,MAAO,WAAW;AAC9C,kBAAY,KAAK,aAAa,OAAO,CAAC;AAAA,IAC1C;AAAA,EACJ;AACA,SAAO,EAAE,OAAO,OAAO,SAAS,YAAW;AAC/C;AAKA,SAAS,WAAW,MAAM;AACtB,QAAM,UAAU,EAAE,MAAM,+BAA+B;AACvD,MAAI,CAAC;AACD,WAAO;AACX,QAAM,aAAaA,YAAAA,MAAU,QAAQ,SAAS;AAC9C,aAAW,iBAAiB,GAAG,YAAY,cAAc,GAAG;AACxD,kBAAc,OAAM;AAAA,EACxB;AACA,SAAO,WAAW,YAAY,KAAI;AACtC;AAEA,SAAS,gBAAgB,WAAW;AAEhC,QAAM,QAAQ,UAAU,MAAM,IAAI;AAClC,QAAM,YAAY,MAAM,KAAK,CAAC,MAAM,EAAE,WAAW,EAAE,GAAG,KAAI,KAAM;AAChE,MAAI,CAAC;AACD,WAAO;AAEX,MAAI,MAAM,KAAK,SAAS;AACpB,WAAO;AAEX,QAAM,OAAO,UAAU,QAAQ,QAAQ,EAAE,EAAE,KAAI;AAC/C,MAAI,CAAC;AACD,WAAO;AACX,QAAM,QAAQ,KAAK,MAAM,KAAK;AAC9B,QAAM,UAAU,MAAM,IAAG,KAAM;AAC/B,QAAM,SAAS,EAAE,QAAO;AACxB,MAAI,MAAM,SAAS;AACf,WAAO,aAAa,MAAM,KAAK,GAAG;AACtC,SAAO;AACX;AAEA,SAAS,aAAa,MAAM;AACxB,QAAM,UAAU,CAAA;AAChB,aAAW,YAAY,GAAG,MAAM,8BAA8B,GAAG;AAC7D,UAAM,WAAW,SAAS,YAAY,KAAI;AAC1C,QAAI,CAAC;AACD;AAEJ,eAAW,QAAQ,SAAS,MAAM,GAAG,GAAG;AACpC,YAAM,SAAS,gBAAgB,IAAI;AACnC,UAAI;AACA,gBAAQ,KAAK,MAAM;AAAA,IAC3B;AAAA,EACJ;AACA,SAAO;AACX;AAEA,SAAS,cAAc,MAAM;AACzB,QAAM,aAAa,EAAE,MAAM,eAAe;AAC1C,MAAI,CAAC;AACD,WAA
O;AACX,QAAM,QAAQA,YAAAA,MAAU,WAAW,SAAS;AAC5C,QAAM,WAAW,EAAE,OAAO,YAAY;AACtC,MAAI;AACA,aAAS,OAAM;AACnB,aAAW,QAAQ,GAAG,OAAO,WAAW,GAAG;AACvC,SAAK,OAAM;AAAA,EACf;AACA,QAAM,OAAO,MAAM,YAAY,KAAI;AACnC,SAAO,QAAQ;AACnB;AAEA,SAAS,cAAc,MAAM;AACzB,QAAM,QAAQ,WAAW,IAAI;AAC7B,QAAM,UAAU,aAAa,IAAI;AACjC,QAAM,WAAW,cAAc,IAAI;AACnC,QAAM,aAAa,GAAG,MAAM,yBAAyB;AACrD,QAAM,WAAW,WAAW,IAAI,CAAC,OAAO,GAAG,YAAY,KAAI,CAAE,EAAE,OAAO,OAAO;AAC7E,QAAM,WAAW,EAAE,OAAO,QAAO;AACjC,MAAI;AACA,aAAS,WAAW;AACxB,MAAI,SAAS,SAAS;AAClB,aAAS,WAAW;AACxB,SAAO;AACX;AAKA,SAAS,eAAe,MAAM;AAC1B,QAAM,WAAW,EAAE,MAAM,eAAe;AACxC,MAAI;AACA,WAAO,SAAS,YAAY,KAAI;AACpC,MAAI,OAAO,KAAK,YAAY,KAAI;AAChC,QAAM,UAAU,EAAE,MAAM,kBAAkB;AAC1C,MAAI,SAAS;AACT,UAAM,YAAY,QAAQ,YAAY,KAAI;AAC1C,QAAI,KAAK,WAAW,SAAS,GAAG;AAC5B,aAAO,KAAK,MAAM,UAAU,MAAM,EAAE,KAAI;AAAA,IAC5C;AAAA,EACJ;AACA,SAAO;AACX;AAEA,SAAS,WAAW,MAAM;AACtB,aAAW,QAAQ,GAAG,MAAM,GAAG,GAAG;AAC9B,UAAM,OAAO,KAAK,aAAa,MAAM,KAAK;AAC1C,UAAM,WAAW,KAAK,MAAM,gBAAgB;AAC5C,QAAI;AACA,aAAO,SAAS,CAAC;AAAA,EACzB;AACA,SAAO;AACX;AAEA,SAAS,gBAAgB,MAAM;AAC3B,QAAM,OAAO,CAAA;AACb,aAAW,QAAQ,GAAG,MAAM,cAAc,GAAG;AACzC,UAAM,KAAK,KAAK,aAAa,IAAI,KAAK,MAAM,KAAK,SAAS,CAAC;AAC3D,UAAM,OAAO,eAAe,IAAI;AAChC,UAAM,MAAM,WAAW,IAAI;AAC3B,UAAM,MAAM,EAAE,IAAI,KAAI;AACtB,QAAI;AACA,UAAI,MAAM;AACd,SAAK,KAAK,GAAG;AAAA,EACjB;AACA,SAAO;AACX;AAIA,SAAS,qBAAqB,MAAM;AAChC,QAAM,QAAQ,EAAE,MAAM,sBAAsB;AAC5C,MAAI,CAAC;AACD,WAAO;AACX,QAAM,QAAQA,YAAAA,MAAU,MAAM,SAAS;AACvC,QAAM,UAAU,EAAE,OAAO,YAAY;AACrC,MAAI;AACA,YAAQ,OAAM;AAClB,QAAM,OAAO,MAAM,YAAY,KAAI;AACnC,SAAO,QAAQ;AACnB;AAKA,SAAS,cAAc,IAAI;AACvB,SAAQ,GAAG,SAAS,YAAW,MAAO,aAClC,CAAC,SAAS,IAAI,kBAAkB,KAChC,CAAC,SAAS,IAAI,qBAAqB;AAC3C;AAEA,SAAS,kBAAkB,SAAS;AAChC,QAAM,WAAW,CAAA;AACjB,aAAW,SAAS,QAAQ,YAAY;AACpC,QAAI,MAAM,aAAa;AACnB;AACJ,UAAM,KAAK;AACX,QAAI,cAAc,EAAE,GAAG;AACnB,eAAS,KAAK,aAAa,EAAE,CAAC;AAAA,IAClC;AAAA,EACJ;AACA,SAAO;AACX;AASO,SAAS,eAAe,MAAM;AACjC,QAAM,OAAOA,YAAAA,MAAU,IAAI;AAC3B,QAAM,WAAW,cAAc,IAAI;AACnC,QAAM,aAAa,gBAAgB,IAAI;AACvC,QAAM,UAAU,EAAE,MAAM,sBAAsB,KAAK;AACnD,QAAM,WA
AW,kBAAkB,OAAO;AAC1C,QAAM,kBAAkB,qBAAqB,IAAI;AACjD,QAAM,MAAM,EAAE,UAAU,UAAU,WAAU;AAC5C,MAAI;AACA,QAAI,kBAAkB;AAC1B,SAAO;AACX;","x_google_ignoreList":[0]}
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
import { readFile, writeFile, stat } from "node:fs/promises";
|
|
2
|
+
import { parseArxivHtml } from "./arxiv-html-parser.js";
|
|
3
|
+
import { parseJatsMetadata, parseJatsBody, parseJatsReferences, parseJatsBackMatter } from "./jats-parser.js";
|
|
4
|
+
import { writeMarkdown } from "./markdown-writer.js";
|
|
5
|
+
/**
 * Convert a PMC JATS XML file to Markdown.
 *
 * Reads the XML at `xmlPath`, parses it with the JATS parsers
 * (`parseJatsMetadata`, `parseJatsBody`, `parseJatsReferences`,
 * `parseJatsBackMatter`), renders the assembled document with
 * `writeMarkdown`, and writes the result to `mdPath`. When `metaPath` is
 * given, the JSON file at that path additionally receives a
 * `files.markdown` entry; that update is best effort and never fails the
 * conversion.
 *
 * @param {string} xmlPath - Path of the JATS XML file to read.
 * @param {string} mdPath - Path the Markdown output is written to.
 * @param {string} [metaPath] - Optional path of an existing meta.json to update.
 * @returns {Promise<object>} `{ success: true, title, sections, references }`
 *   (the last two are counts) on success, or `{ success: false, error }` when
 *   reading, parsing, or writing the Markdown throws.
 */
async function convertPmcXmlToMarkdown(xmlPath, mdPath, metaPath) {
  try {
    const xml = await readFile(xmlPath, "utf-8");
    const metadata = parseJatsMetadata(xml);
    const sections = parseJatsBody(xml);
    const references = parseJatsReferences(xml);
    const backMatter = parseJatsBackMatter(xml);
    const doc = { metadata, sections, references };
    // Copy only the back-matter parts that are actually present, so the
    // document never carries empty/undefined optional fields.
    for (const key of ["acknowledgments", "appendices", "footnotes", "floats", "notes"]) {
      if (backMatter[key]) {
        doc[key] = backMatter[key];
      }
    }
    const md = writeMarkdown(doc);
    await writeFile(mdPath, md, "utf-8");
    if (metaPath) {
      try {
        // readFile rejects when meta.json is missing, so no separate
        // existence check (stat) is needed before reading it.
        const meta = JSON.parse(await readFile(metaPath, "utf-8"));
        // Only a JSON object can carry a `files` map; anything else is left alone.
        if (meta !== null && typeof meta === "object" && !Array.isArray(meta)) {
          const mdStat = await stat(mdPath);
          // A readable meta.json without a `files` map must still be updated
          // instead of failing on the property assignment below.
          if (meta.files == null) {
            meta.files = {};
          }
          meta.files.markdown = {
            filename: "fulltext.md",
            source: "conversion",
            retrievedAt: new Date().toISOString(),
            size: mdStat.size,
            convertedFrom: "fulltext.xml",
          };
          await writeFile(metaPath, `${JSON.stringify(meta, null, 2)}\n`, "utf-8");
        }
      } catch {
        // Deliberate best effort: meta.json is missing, unreadable, or not
        // valid JSON. The Markdown file has already been written, so the
        // conversion itself is still reported as successful.
      }
    }
    return {
      success: true,
      title: metadata.title,
      sections: sections.length,
      references: references.length,
    };
  } catch (err) {
    return {
      success: false,
      error: err instanceof Error ? err.message : String(err),
    };
  }
}
|
|
54
|
+
/**
 * Convert an arXiv HTML file to Markdown.
 *
 * Reads the HTML at `htmlPath`, parses it with `parseArxivHtml`, renders the
 * resulting document with `writeMarkdown`, and writes the result to `mdPath`.
 * When `metaPath` is given, the JSON file at that path additionally receives
 * a `files.markdown` entry; that update is best effort and never fails the
 * conversion.
 *
 * @param {string} htmlPath - Path of the arXiv HTML file to read.
 * @param {string} mdPath - Path the Markdown output is written to.
 * @param {string} [metaPath] - Optional path of an existing meta.json to update.
 * @returns {Promise<object>} `{ success: true, title, sections, references }`
 *   (the last two are counts) on success, or `{ success: false, error }` when
 *   reading, parsing, or writing the Markdown throws.
 */
async function convertArxivHtmlToMarkdown(htmlPath, mdPath, metaPath) {
  try {
    const html = await readFile(htmlPath, "utf-8");
    const doc = parseArxivHtml(html);
    const md = writeMarkdown(doc);
    await writeFile(mdPath, md, "utf-8");
    if (metaPath) {
      try {
        // readFile rejects when meta.json is missing, so no separate
        // existence check (stat) is needed before reading it.
        const meta = JSON.parse(await readFile(metaPath, "utf-8"));
        // Only a JSON object can carry a `files` map; anything else is left alone.
        if (meta !== null && typeof meta === "object" && !Array.isArray(meta)) {
          const mdStat = await stat(mdPath);
          // A readable meta.json without a `files` map must still be updated
          // instead of failing on the property assignment below.
          if (meta.files == null) {
            meta.files = {};
          }
          meta.files.markdown = {
            filename: "fulltext.md",
            source: "conversion",
            retrievedAt: new Date().toISOString(),
            size: mdStat.size,
            convertedFrom: "fulltext.html",
          };
          await writeFile(metaPath, `${JSON.stringify(meta, null, 2)}\n`, "utf-8");
        }
      } catch {
        // Deliberate best effort: meta.json is missing, unreadable, or not
        // valid JSON. The Markdown file has already been written, so the
        // conversion itself is still reported as successful.
      }
    }
    return {
      success: true,
      title: doc.metadata.title,
      sections: doc.sections.length,
      references: doc.references.length,
    };
  } catch (err) {
    return {
      success: false,
      error: err instanceof Error ? err.message : String(err),
    };
  }
}
|
|
89
|
+
export {
|
|
90
|
+
convertArxivHtmlToMarkdown,
|
|
91
|
+
convertPmcXmlToMarkdown
|
|
92
|
+
};
|
|
93
|
+
//# sourceMappingURL=index.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.js","sources":["../../../../../../node_modules/@ncukondo/academic-fulltext/dist/convert/index.js"],"sourcesContent":["/**\n * Conversion orchestrator for PMC XML and arXiv HTML to Markdown.\n *\n * Ties together the parsers and Markdown writer with file I/O.\n */\nimport { readFile, stat, writeFile } from \"node:fs/promises\";\nimport { parseArxivHtml } from \"./arxiv-html-parser.js\";\nimport { parseJatsBackMatter, parseJatsBody, parseJatsMetadata, parseJatsReferences, } from \"./jats-parser.js\";\nimport { writeMarkdown } from \"./markdown-writer.js\";\n/**\n * Convert a PMC JATS XML file to Markdown.\n *\n * Reads the XML, parses it into a JatsDocument, writes Markdown,\n * and optionally updates meta.json.\n */\nexport async function convertPmcXmlToMarkdown(xmlPath, mdPath, metaPath) {\n try {\n const xml = await readFile(xmlPath, \"utf-8\");\n // Parse\n const metadata = parseJatsMetadata(xml);\n const sections = parseJatsBody(xml);\n const references = parseJatsReferences(xml);\n const backMatter = parseJatsBackMatter(xml);\n const doc = { metadata, sections, references };\n if (backMatter.acknowledgments)\n doc.acknowledgments = backMatter.acknowledgments;\n if (backMatter.appendices)\n doc.appendices = backMatter.appendices;\n if (backMatter.footnotes)\n doc.footnotes = backMatter.footnotes;\n if (backMatter.floats)\n doc.floats = backMatter.floats;\n if (backMatter.notes)\n doc.notes = backMatter.notes;\n // Write Markdown\n const md = writeMarkdown(doc);\n await writeFile(mdPath, md, \"utf-8\");\n // Update meta.json if path provided and file exists\n if (metaPath) {\n try {\n await stat(metaPath);\n const metaRaw = await readFile(metaPath, \"utf-8\");\n const meta = JSON.parse(metaRaw);\n const mdStat = await stat(mdPath);\n meta.files.markdown = {\n filename: \"fulltext.md\",\n source: \"conversion\",\n retrievedAt: new Date().toISOString(),\n size: mdStat.size,\n convertedFrom: \"fulltext.xml\",\n };\n await 
writeFile(metaPath, `${JSON.stringify(meta, null, 2)}\\n`, \"utf-8\");\n }\n catch {\n // meta.json doesn't exist or can't be read, skip update\n }\n }\n const result = { success: true };\n result.title = metadata.title;\n result.sections = sections.length;\n result.references = references.length;\n return result;\n }\n catch (err) {\n const result = { success: false };\n result.error = err instanceof Error ? err.message : String(err);\n return result;\n }\n}\n/**\n * Convert an arXiv HTML file to Markdown.\n *\n * Reads the HTML, parses it into a JatsDocument via the arXiv HTML parser,\n * writes Markdown using the shared writer, and optionally updates meta.json.\n */\nexport async function convertArxivHtmlToMarkdown(htmlPath, mdPath, metaPath) {\n try {\n const html = await readFile(htmlPath, \"utf-8\");\n const doc = parseArxivHtml(html);\n // Write Markdown\n const md = writeMarkdown(doc);\n await writeFile(mdPath, md, \"utf-8\");\n // Update meta.json if path provided and file exists\n if (metaPath) {\n try {\n await stat(metaPath);\n const metaRaw = await readFile(metaPath, \"utf-8\");\n const meta = JSON.parse(metaRaw);\n const mdStat = await stat(mdPath);\n meta.files.markdown = {\n filename: \"fulltext.md\",\n source: \"conversion\",\n retrievedAt: new Date().toISOString(),\n size: mdStat.size,\n convertedFrom: \"fulltext.html\",\n };\n await writeFile(metaPath, `${JSON.stringify(meta, null, 2)}\\n`, \"utf-8\");\n }\n catch {\n // meta.json doesn't exist or can't be read, skip update\n }\n }\n const result = { success: true };\n result.title = doc.metadata.title;\n result.sections = doc.sections.length;\n result.references = doc.references.length;\n return result;\n }\n catch (err) {\n const result = { success: false };\n result.error = err instanceof Error ? 
err.message : String(err);\n return result;\n }\n}\n//# sourceMappingURL=index.js.map"],"names":[],"mappings":";;;;AAeO,eAAe,wBAAwB,SAAS,QAAQ,UAAU;AACrE,MAAI;AACA,UAAM,MAAM,MAAM,SAAS,SAAS,OAAO;AAE3C,UAAM,WAAW,kBAAkB,GAAG;AACtC,UAAM,WAAW,cAAc,GAAG;AAClC,UAAM,aAAa,oBAAoB,GAAG;AAC1C,UAAM,aAAa,oBAAoB,GAAG;AAC1C,UAAM,MAAM,EAAE,UAAU,UAAU,WAAU;AAC5C,QAAI,WAAW;AACX,UAAI,kBAAkB,WAAW;AACrC,QAAI,WAAW;AACX,UAAI,aAAa,WAAW;AAChC,QAAI,WAAW;AACX,UAAI,YAAY,WAAW;AAC/B,QAAI,WAAW;AACX,UAAI,SAAS,WAAW;AAC5B,QAAI,WAAW;AACX,UAAI,QAAQ,WAAW;AAE3B,UAAM,KAAK,cAAc,GAAG;AAC5B,UAAM,UAAU,QAAQ,IAAI,OAAO;AAEnC,QAAI,UAAU;AACV,UAAI;AACA,cAAM,KAAK,QAAQ;AACnB,cAAM,UAAU,MAAM,SAAS,UAAU,OAAO;AAChD,cAAM,OAAO,KAAK,MAAM,OAAO;AAC/B,cAAM,SAAS,MAAM,KAAK,MAAM;AAChC,aAAK,MAAM,WAAW;AAAA,UAClB,UAAU;AAAA,UACV,QAAQ;AAAA,UACR,cAAa,oBAAI,KAAI,GAAG,YAAW;AAAA,UACnC,MAAM,OAAO;AAAA,UACb,eAAe;AAAA,QACnC;AACgB,cAAM,UAAU,UAAU,GAAG,KAAK,UAAU,MAAM,MAAM,CAAC,CAAC;AAAA,GAAM,OAAO;AAAA,MAC3E,QACM;AAAA,MAEN;AAAA,IACJ;AACA,UAAM,SAAS,EAAE,SAAS,KAAI;AAC9B,WAAO,QAAQ,SAAS;AACxB,WAAO,WAAW,SAAS;AAC3B,WAAO,aAAa,WAAW;AAC/B,WAAO;AAAA,EACX,SACO,KAAK;AACR,UAAM,SAAS,EAAE,SAAS,MAAK;AAC/B,WAAO,QAAQ,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG;AAC9D,WAAO;AAAA,EACX;AACJ;AAOO,eAAe,2BAA2B,UAAU,QAAQ,UAAU;AACzE,MAAI;AACA,UAAM,OAAO,MAAM,SAAS,UAAU,OAAO;AAC7C,UAAM,MAAM,eAAe,IAAI;AAE/B,UAAM,KAAK,cAAc,GAAG;AAC5B,UAAM,UAAU,QAAQ,IAAI,OAAO;AAEnC,QAAI,UAAU;AACV,UAAI;AACA,cAAM,KAAK,QAAQ;AACnB,cAAM,UAAU,MAAM,SAAS,UAAU,OAAO;AAChD,cAAM,OAAO,KAAK,MAAM,OAAO;AAC/B,cAAM,SAAS,MAAM,KAAK,MAAM;AAChC,aAAK,MAAM,WAAW;AAAA,UAClB,UAAU;AAAA,UACV,QAAQ;AAAA,UACR,cAAa,oBAAI,KAAI,GAAG,YAAW;AAAA,UACnC,MAAM,OAAO;AAAA,UACb,eAAe;AAAA,QACnC;AACgB,cAAM,UAAU,UAAU,GAAG,KAAK,UAAU,MAAM,MAAM,CAAC,CAAC;AAAA,GAAM,OAAO;AAAA,MAC3E,QACM;AAAA,MAEN;AAAA,IACJ;AACA,UAAM,SAAS,EAAE,SAAS,KAAI;AAC9B,WAAO,QAAQ,IAAI,SAAS;AAC5B,WAAO,WAAW,IAAI,SAAS;AAC/B,WAAO,aAAa,IAAI,WAAW;AACnC,WAAO;AAAA,EACX,SACO,KAAK;AACR,UAAM,SAAS,EAAE,SAAS,MAAK;AAC/B,WAAO,QAAQ,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG;AAC9D,WAAO;AAAA,EACX;AACJ;","x_google_ignoreList
":[0]}
|