pembot 0.0.3__tar.gz → 0.0.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pembot might be problematic.
- {pembot-0.0.3 → pembot-0.0.5}/PKG-INFO +1 -1
- pembot-0.0.5/pembot/.git/COMMIT_EDITMSG +1 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/index +0 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/logs/HEAD +1 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/logs/refs/heads/main +1 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/logs/refs/remotes/origin/main +1 -0
- pembot-0.0.5/pembot/.git/objects/0b/db4169fc0f312b8698f1df17a258fff163aeaa +0 -0
- pembot-0.0.5/pembot/.git/objects/1f/83a471c8119f7794d98c049170a5d7d07a4b71 +0 -0
- pembot-0.0.5/pembot/.git/objects/41/cbeb6bcb4c6fa9ef9be571082d95ecb4ea0ee3 +0 -0
- pembot-0.0.5/pembot/.git/objects/63/1700a51c8fa97b543991f5f61bfcd1e7e1327d +0 -0
- pembot-0.0.5/pembot/.git/objects/ab/139d2cd4798dd8e2c565b80440b1a44b376126 +0 -0
- pembot-0.0.5/pembot/.git/objects/bf/068a0714e2145de83a5c004f4213b091439d0e +0 -0
- pembot-0.0.5/pembot/.git/objects/d0/937f7d832266337289d5ec09459f931a46fcf7 +0 -0
- pembot-0.0.5/pembot/.git/objects/fc/988aab7e2d46396dc595ad24345e8e77dda0e4 +0 -0
- pembot-0.0.5/pembot/.git/refs/heads/main +1 -0
- pembot-0.0.5/pembot/.git/refs/remotes/origin/main +1 -0
- pembot-0.0.5/pembot/AnyToText/convertor.py +364 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/__init__.py +1 -1
- {pembot-0.0.3 → pembot-0.0.5}/pembot/config/config.yaml +1 -1
- {pembot-0.0.3 → pembot-0.0.5}/pembot/main.py +26 -8
- {pembot-0.0.3 → pembot-0.0.5}/pembot/pdf2markdown/extract.py +266 -309
- {pembot-0.0.3 → pembot-0.0.5}/pembot/query.py +15 -9
- pembot-0.0.3/pembot/.git/COMMIT_EDITMSG +0 -1
- pembot-0.0.3/pembot/.git/refs/heads/main +0 -1
- pembot-0.0.3/pembot/.git/refs/remotes/origin/main +0 -1
- pembot-0.0.3/pembot/AnyToText/convertor.py +0 -260
- {pembot-0.0.3 → pembot-0.0.5}/LICENSE +0 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/HEAD +0 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/config +0 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/description +0 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/hooks/applypatch-msg.sample +0 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/hooks/commit-msg.sample +0 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/hooks/fsmonitor-watchman.sample +0 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/hooks/post-update.sample +0 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/hooks/pre-applypatch.sample +0 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/hooks/pre-commit.sample +0 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/hooks/pre-merge-commit.sample +0 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/hooks/pre-push.sample +0 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/hooks/pre-rebase.sample +0 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/hooks/pre-receive.sample +0 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/hooks/prepare-commit-msg.sample +0 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/hooks/push-to-checkout.sample +0 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/hooks/sendemail-validate.sample +0 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/hooks/update.sample +0 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/info/exclude +0 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/logs/refs/remotes/origin/HEAD +0 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/objects/0a/fb3a98cdc55b1434b44534ec2bf22c56cfa26c +0 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/objects/0c/8d9b2690545bf1906b05cd9f18b783b3eb74f1 +0 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/objects/18/28e18ab80aa64d334b26428708140e280cbc63 +0 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/objects/19/f61df7dbd562d04f561288677bbf2f18f5dff7 +0 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/objects/28/db0ab48059acccd7d257aa02e52e9b6b83a4a5 +0 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/objects/35/97e518a8658280be9f377f78edf1dfa1f23814 +0 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/objects/3d/07d3b29ff53d95de3898fb786d61732f210515 +0 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/objects/3e/cf23eb95123287531d708a21d4ba88d92ccabb +0 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/objects/3f/78215d7e17da726fb352fd92b3c117db9b63ba +0 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/objects/3f/e072cf3cb6a9f30c3e9936e3ddf622e80270d0 +0 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/objects/51/9e780574933d7627a083222bd10dd74f430904 +0 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/objects/61/46a371b9c1bd9f51af273f11f986cfd1bedeba +0 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/objects/64/00040794955d17c9a1fe1aaaea59f2c4822177 +0 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/objects/6d/7a865a23b1cb4182f67907820104ced48b11c9 +0 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/objects/72/f047cda92abcd1ddc857f6461de605f8668331 +0 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/objects/73/2e98f08bc806c331b06847fc8c743f545499e5 +0 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/objects/86/cdaec229f1fbebf43042266b03878944669f25 +0 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/objects/87/d6df5217a4a374f8c1211a05f9bd657f72c9a7 +0 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/objects/8b/5be2af9b16f290549193859c214cd9072212e8 +0 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/objects/93/8f29d9b4b1ae86e39dddf9e3d115a82ddfc9b6 +0 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/objects/9b/123713e30fc9e225f9ac8ff5b02f8f8cf86456 +0 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/objects/ab/c6b15265171457b41e2cfdaf3b8c3994a59eb7 +0 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/objects/ac/9c9018c62fa30dc142665c1b5a375f4e056880 +0 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/objects/b1/1173d9b68db117437ccb9551461152e1e8a77d +0 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/objects/b2/4e79ab07fe9e68781961a25ff9f1dbb1546fbb +0 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/objects/b8/eea52176ffa4d88c5a9976bee26092421565d3 +0 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/objects/bf/32a7e6872e5dc4025ee3df3c921ec7ade0855f +0 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/objects/c0/793458db6e1bee7f79f1a504fb8ff4963f8ed3 +0 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/objects/c2/443060c07101948487cfa93cc39e082e9e0f5f +0 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/objects/e5/3070f2b07f45d031444b09b1b38658f3caf29e +0 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/objects/e7/911a702079a6144997ea4e70f59abbe59ec2bc +0 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/objects/e9/1172752e9a421ae463112d2b0506b37498c98d +0 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/objects/ea/0af89e61a882c5afc2a8c281b2d96f174bfe58 +0 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/objects/eb/75e1c49f1e5b79dca17ccdbec8067756523238 +0 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/objects/f1/655afa1c5636c8d58969e3194bb770aefbc552 +0 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/objects/f4/e991088a63def67a30a2b8bbdb4d58514abab8 +0 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/objects/f8/cbb5bfd1503e66cec2c593362c60a317b6d300 +0 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/objects/f9/98e1f01c2bf0a20159fc851327af05beb3ac88 +0 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/objects/fa/9c9a62ec1203a5868b033ded428c2382c4e1b6 +0 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/objects/fb/6c90c9ce5e0cdfbe074a3f060afc66f62eefde +0 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/objects/fc/e56f1e09d09a05b9babf796fb40bece176f3a2 +0 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/objects/pack/pack-d5469edc8c36e3bb1de5e0070e4d5b1eae935dd4.idx +0 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/objects/pack/pack-d5469edc8c36e3bb1de5e0070e4d5b1eae935dd4.pack +0 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/objects/pack/pack-d5469edc8c36e3bb1de5e0070e4d5b1eae935dd4.rev +0 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/packed-refs +0 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/refs/remotes/origin/HEAD +0 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/.gitignore +0 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/AnyToText/__init__.py +0 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/LICENSE +0 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/TextEmbedder/__init__.py +0 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/TextEmbedder/gemini_embedder.py +0 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/TextEmbedder/mongodb_embedder.py +0 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/TextEmbedder/mongodb_index_creator.py +0 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/TextEmbedder/vector_query.py +0 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/gartner.py +0 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/output_structure_local.py +0 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/pdf2markdown/.git/HEAD +0 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/pdf2markdown/.git/config +0 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/pdf2markdown/.git/description +0 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/pdf2markdown/.git/hooks/applypatch-msg.sample +0 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/pdf2markdown/.git/hooks/commit-msg.sample +0 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/pdf2markdown/.git/hooks/fsmonitor-watchman.sample +0 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/pdf2markdown/.git/hooks/post-update.sample +0 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/pdf2markdown/.git/hooks/pre-applypatch.sample +0 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/pdf2markdown/.git/hooks/pre-commit.sample +0 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/pdf2markdown/.git/hooks/pre-merge-commit.sample +0 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/pdf2markdown/.git/hooks/pre-push.sample +0 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/pdf2markdown/.git/hooks/pre-rebase.sample +0 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/pdf2markdown/.git/hooks/pre-receive.sample +0 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/pdf2markdown/.git/hooks/prepare-commit-msg.sample +0 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/pdf2markdown/.git/hooks/push-to-checkout.sample +0 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/pdf2markdown/.git/hooks/sendemail-validate.sample +0 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/pdf2markdown/.git/hooks/update.sample +0 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/pdf2markdown/.git/index +0 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/pdf2markdown/.git/info/exclude +0 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/pdf2markdown/.git/logs/HEAD +0 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/pdf2markdown/.git/logs/refs/heads/main +0 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/pdf2markdown/.git/logs/refs/remotes/origin/HEAD +0 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/pdf2markdown/.git/objects/pack/pack-d3051affdd6c31306dc53489168fc870872085d1.idx +0 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/pdf2markdown/.git/objects/pack/pack-d3051affdd6c31306dc53489168fc870872085d1.pack +0 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/pdf2markdown/.git/objects/pack/pack-d3051affdd6c31306dc53489168fc870872085d1.rev +0 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/pdf2markdown/.git/packed-refs +0 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/pdf2markdown/.git/refs/heads/main +0 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/pdf2markdown/.git/refs/remotes/origin/HEAD +0 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/pdf2markdown/LICENSE +0 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/pdf2markdown/README.md +0 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/pdf2markdown/__init__.py +0 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/pdf2markdown/config/config.yaml +0 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/pdf2markdown/requirements.txt +0 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/pem.py +0 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/utils/__init__.py +0 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/utils/inference_client.py +0 -0
- {pembot-0.0.3 → pembot-0.0.5}/pembot/utils/string_tools.py +0 -0
- {pembot-0.0.3 → pembot-0.0.5}/pyproject.toml +0 -0
pembot-0.0.5/pembot/.git/COMMIT_EDITMSG

@@ -0,0 +1 @@
+fixed the output_dir bug; fixed the excel to json function; ran some tests on convertor; incremented the version on the package; removed dependency on schema / structure, and shifted required fields to a pickle file path in the cli args;

{pembot-0.0.3 → pembot-0.0.5}/pembot/.git/index

Binary file
{pembot-0.0.3 → pembot-0.0.5}/pembot/.git/logs/HEAD

@@ -4,3 +4,4 @@ ac9c9018c62fa30dc142665c1b5a375f4e056880 72f047cda92abcd1ddc857f6461de605f866833
 72f047cda92abcd1ddc857f6461de605f8668331 e91172752e9a421ae463112d2b0506b37498c98d cyto <silverstone965@gmail.com> 1748881846 +0530 commit: added gemini to the embedders and llms ladders; redeclared the required fields;
 e91172752e9a421ae463112d2b0506b37498c98d 0c8d9b2690545bf1906b05cd9f18b783b3eb74f1 cyto <silverstone965@gmail.com> 1749716350 +0530 commit: added a pem blog chunking module for updating from local, and, an embedding loop to embed all the blogs, with document id as the filter in the search, and the first line title as the filter in updation
 0c8d9b2690545bf1906b05cd9f18b783b3eb74f1 eb75e1c49f1e5b79dca17ccdbec8067756523238 cyto <silverstone965@gmail.com> 1750856653 +0530 commit: made arrangements for the cases when custom file bytes are to be processed to text output; handled a ollama running / crashing error
+eb75e1c49f1e5b79dca17ccdbec8067756523238 0bdb4169fc0f312b8698f1df17a258fff163aeaa cyto <silverstone965@gmail.com> 1750937276 +0530 commit: fixed the output_dir bug; fixed the excel to json function; ran some tests on convertor; incremented the version on the package; removed dependency on schema / structure, and shifted required fields to a pickle file path in the cli args;
{pembot-0.0.3 → pembot-0.0.5}/pembot/.git/logs/refs/heads/main

@@ -4,3 +4,4 @@ ac9c9018c62fa30dc142665c1b5a375f4e056880 72f047cda92abcd1ddc857f6461de605f866833
 72f047cda92abcd1ddc857f6461de605f8668331 e91172752e9a421ae463112d2b0506b37498c98d cyto <silverstone965@gmail.com> 1748881846 +0530 commit: added gemini to the embedders and llms ladders; redeclared the required fields;
 e91172752e9a421ae463112d2b0506b37498c98d 0c8d9b2690545bf1906b05cd9f18b783b3eb74f1 cyto <silverstone965@gmail.com> 1749716350 +0530 commit: added a pem blog chunking module for updating from local, and, an embedding loop to embed all the blogs, with document id as the filter in the search, and the first line title as the filter in updation
 0c8d9b2690545bf1906b05cd9f18b783b3eb74f1 eb75e1c49f1e5b79dca17ccdbec8067756523238 cyto <silverstone965@gmail.com> 1750856653 +0530 commit: made arrangements for the cases when custom file bytes are to be processed to text output; handled a ollama running / crashing error
+eb75e1c49f1e5b79dca17ccdbec8067756523238 0bdb4169fc0f312b8698f1df17a258fff163aeaa cyto <silverstone965@gmail.com> 1750937276 +0530 commit: fixed the output_dir bug; fixed the excel to json function; ran some tests on convertor; incremented the version on the package; removed dependency on schema / structure, and shifted required fields to a pickle file path in the cli args;
{pembot-0.0.3 → pembot-0.0.5}/pembot/.git/logs/refs/remotes/origin/main

@@ -3,3 +3,4 @@ ac9c9018c62fa30dc142665c1b5a375f4e056880 72f047cda92abcd1ddc857f6461de605f866833
 72f047cda92abcd1ddc857f6461de605f8668331 e91172752e9a421ae463112d2b0506b37498c98d cyto <silverstone965@gmail.com> 1748881859 +0530 update by push
 e91172752e9a421ae463112d2b0506b37498c98d 0c8d9b2690545bf1906b05cd9f18b783b3eb74f1 cyto <silverstone965@gmail.com> 1749716371 +0530 update by push
 0c8d9b2690545bf1906b05cd9f18b783b3eb74f1 eb75e1c49f1e5b79dca17ccdbec8067756523238 cyto <silverstone965@gmail.com> 1750856672 +0530 update by push
+eb75e1c49f1e5b79dca17ccdbec8067756523238 0bdb4169fc0f312b8698f1df17a258fff163aeaa cyto <silverstone965@gmail.com> 1750937389 +0530 update by push
pembot-0.0.5/pembot/.git/refs/heads/main

@@ -0,0 +1 @@
+0bdb4169fc0f312b8698f1df17a258fff163aeaa
pembot-0.0.5/pembot/.git/refs/remotes/origin/main

@@ -0,0 +1 @@
+0bdb4169fc0f312b8698f1df17a258fff163aeaa
pembot-0.0.5/pembot/AnyToText/convertor.py

@@ -0,0 +1,364 @@
+from tempfile import TemporaryDirectory
+import mimetypes
+from pathlib import Path
+from pembot.pdf2markdown.extract import MarkdownPDFExtractor
+import os
+import json
+import pandas as pd
+from typing import Literal, Union, Dict, Any, List
+import tempfile
+from datetime import datetime, date
+
+
+PandasReadEngineType = Literal['xlrd', 'openpyxl', 'odf', 'pyxlsb', 'calamine', None]
+
+EXCEL_FILE_TYPES= [
+    'text/csv',
+    'application/vnd.ms-excel',
+    'application/msexcel',
+    'application/x-msexcel',
+    'application/x-ms-excel',
+    'application/x-excel',
+    'application/x-dos_ms_excel',
+    'application/x-dos_ms_excel',
+    'application/xls',
+    'application/x-xls',
+    'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
+    'application/vnd.oasis.opendocument.spreadsheet',
+]
+
+
+class Convertor():
+
+
+    def __init__(self, myfile: Path | None= None, output_dir: Path | None= None, file_bytes: bytes | None= None, suffix: str | None= None, file_type: str | None= None):
+
+        self.output= ""
+
+        # file_type can be pdf, excel, etc.
+        if output_dir is None and myfile is None and file_bytes is not None and suffix is not None:
+            with tempfile.TemporaryDirectory() as dp:
+                with tempfile.NamedTemporaryFile(suffix= suffix, mode= 'wb') as fp:
+                    fp.write(file_bytes)
+                    myfile= Path(fp.name)
+                    output_dir= Path(dp)
+                    if file_type == 'pdf':
+                        extractor= MarkdownPDFExtractor(str(myfile), output_path= str(output_dir), page_delimiter= "-- NEXT PAGE --")
+                        extractor.extract()
+                        with open(output_dir / (myfile.stem + '.md')) as output_file:
+                            self.output= output_file.read()
+                    elif file_type == 'excel':
+                        self.input_filepath= myfile
+                        self.json_filepath = output_dir / (myfile.stem + ".json")
+                        self.convert_file_to_json()
+                        with open(output_dir / (myfile.stem + '.json')) as output_file:
+                            self.output= output_file.read()
+
+        elif output_dir is not None and myfile is not None:
+            print("got output path for conversion: ", output_dir)
+            mt= mimetypes.guess_file_type(str(myfile))[0]
+
+            self.output_dir= output_dir
+            self.input_filepath= myfile
+            base_name, _ = os.path.splitext(myfile.name)
+            self.json_filepath = output_dir / 'json' / (base_name + ".json")
+
+            if mt == 'application/json':
+                print("the file was json")
+            elif mt == 'application/pdf':
+                print("the file was pdf, outputting in: ", output_dir)
+                extractor= MarkdownPDFExtractor(str(myfile), output_path= str(self.output_dir), page_delimiter= "-- NEXT PAGE --")
+                extractor.extract()
+
+            elif mt in EXCEL_FILE_TYPES:
+                self.convert_file_to_json()
+
+            else:
+                print(mt)
+
+    def convert_file_to_json(
+        self,
+        sheet_to_convert: Union[str, int, None] = None, # Relevant for Excel/ODS
+        orient: Literal['dict', 'list', 'series', 'split', 'records', 'index'] = 'records', # Corrected type hint
+        date_format: Union[str, None] = 'iso', # 'iso', 'epoch', or None
+        csv_encoding: str = 'utf-8', # For reading CSV files
+        excel_ods_engine: PandasReadEngineType = None # For Excel/ODS, e.g., 'openpyxl', 'xlrd', 'odf'
+    ) -> bool:
+        """
+        Converts an Excel, ODS, or CSV file (or a specific Excel/ODS sheet)
+        into an equivalent JSON format.
+
+        Args:
+            sheet_to_convert (str | int | None, optional):
+                - For Excel/ODS:
+                    - If None (default): Converts all sheets. The JSON output will be a
+                      dictionary where keys are sheet names and values are the JSON
+                      representation of each sheet.
+                    - If str: Name of the specific sheet to convert.
+                    - If int: Index of the specific sheet to convert (0-based).
+                    If a specific sheet is requested, the JSON output will directly be
+                    the representation of that sheet.
+                - For CSV: This parameter is ignored. The entire CSV is processed.
+            orient (str, optional): Pandas DataFrame.to_dict() orientation for each sheet/CSV.
+                Default: 'records'. See pandas.DataFrame.to_dict() documentation.
+            date_format (str | None, optional): Format for datetime objects.
+                - 'iso' (default): ISO8601 format (e.g., '2023-10-27T10:30:00').
+                - 'epoch': Milliseconds since epoch.
+                - None: Pandas default (often Timestamps). 'iso' is generally safer for JSON.
+            csv_encoding (str, optional): Encoding for reading CSV files. Default is 'utf-8'.
+            excel_ods_engine (str | None, optional): Pandas engine for reading Excel or ODS files.
+                - For Excel: 'openpyxl' (for .xlsx), 'xlrd' (for .xls).
+                - For ODS: 'odf' (requires 'odfpy' library).
+                If None, pandas auto-detects based on file extension and installed libraries.
+
+        Returns:
+            bool: True if conversion was successful, False otherwise.
+        """
+
+        input_filepath = self.input_filepath
+        json_filepath = self.json_filepath
+
+        try:
+
+            if not input_filepath.exists():
+                print(f"Error: Input file not found at {input_filepath}")
+                return False
+
+            # Ensure output directory exists
+            json_filepath.parent.mkdir(parents=True, exist_ok=True)
+
+            file_suffix = input_filepath.suffix.lower()
+            output_data_final: Union[Dict[str, Any], List[Dict[str, Any]]] = {}
+
+            dataframes_to_process: list[tuple[pd.DataFrame, str | None]] = []
+
+            current_engine: PandasReadEngineType = excel_ods_engine
+
+            if file_suffix == '.csv':
+                if sheet_to_convert is not None:
+                    print(f"Info: 'sheet_to_convert' parameter ('{sheet_to_convert}') is ignored for CSV file '{input_filepath.name}'. Processing entire CSV.")
+                try:
+                    df = pd.read_csv(input_filepath, encoding=csv_encoding)
+                    dataframes_to_process.append((df, None))
+                except Exception as e:
+                    print(f"Error reading CSV file '{input_filepath.name}': {e}")
+                    return False
+
+            elif file_suffix in ['.xls', '.xlsx', '.ods']:
+                try:
+                    if file_suffix == '.ods':
+                        if current_engine is None:
+                            current_engine = 'odf'
+                        elif current_engine != 'odf':
+                            print(f"Warning: Specified engine '{current_engine}' may not be optimal for ODS. Forcing 'odf'.")
+                            current_engine = 'odf'
+
+                    if sheet_to_convert is not None:
+                        df = pd.read_excel(input_filepath, sheet_name=sheet_to_convert, engine=current_engine)
+                        dataframes_to_process.append((df, None))
+
+                    else:
+                        excel_file = pd.ExcelFile(input_filepath, engine=current_engine)
+                        if not excel_file.sheet_names:
+                            print(f"Warning: File '{input_filepath.name}' contains no sheets.")
+                        for sheet_name in excel_file.sheet_names:
+                            df = excel_file.parse(sheet_name) # engine is inherited
+                            dataframes_to_process.append((df, sheet_name))
+                except ImportError as ie:
+                    if 'odfpy' in str(ie).lower() and file_suffix == '.ods':
+                        print(f"Error reading ODS file '{input_filepath.name}': The 'odfpy' library is required. Please install it using 'pip install odfpy'.")
+                    elif 'xlrd' in str(ie).lower() and file_suffix == '.xls':
+                        print(f"Error reading .xls file '{input_filepath.name}': The 'xlrd' library might be required. Please install it using 'pip install xlrd'.")
+                    elif 'openpyxl' in str(ie).lower() and file_suffix == '.xlsx':
+                        print(f"Error reading .xlsx file '{input_filepath.name}': The 'openpyxl' library might be required. Please install it using 'pip install openpyxl'.")
+                    else:
+                        print(f"ImportError reading file '{input_filepath.name}': {ie}")
+                    return False
+                except Exception as e:
+                    print(f"Error reading Excel/ODS file '{input_filepath.name}': {e}")
+                    return False
+            else:
+                print(f"Error: Unsupported file type: '{file_suffix}'. Please provide a CSV, XLS, XLSX, or ODS file.")
+                return False
+
+            if not dataframes_to_process and file_suffix in ['.xls', '.xlsx', '.ods'] and sheet_to_convert is None:
+                print(f"Info: No dataframes were loaded from '{input_filepath.name}'. Output JSON will be empty if processing all sheets from an empty file.")
+            elif not dataframes_to_process and not (file_suffix in ['.xls', '.xlsx', '.ods'] and sheet_to_convert is None):
+                pass
+
+            is_direct_output = len(dataframes_to_process) == 1 and dataframes_to_process[0][1] is None
+            temp_processed_data: Dict[str, Any] = {}
+
+            for df_original, name_key in dataframes_to_process:
+                df = df_original.copy()
+
+                # Handle datetime columns with improved detection and conversion
+                if date_format:
+                    # Check for datetime columns using multiple approaches
+                    datetime_columns = []
+
+                    # Method 1: Use pandas dtype detection
+                    datetime_columns.extend(df.select_dtypes(include=['datetime64[ns]', 'datetime', 'datetimetz']).columns.tolist())
+
+                    # Method 2: Check for datetime objects in each column
+                    for col in df.columns:
+                        if col not in datetime_columns:
+                            # Sample a few non-null values to check type
+                            sample_values = df[col].dropna().head(10)
+                            if len(sample_values) > 0:
+                                for val in sample_values:
+                                    if isinstance(val, (datetime, date, pd.Timestamp)):
+                                        datetime_columns.append(col)
+                                        break
+
+                    # Convert datetime columns
+                    for col_name in datetime_columns:
+                        try:
+                            if date_format == 'iso':
+                                df[col_name] = df[col_name].apply(lambda x: self._convert_to_iso(x))
+                            elif date_format == 'epoch':
+                                df[col_name] = df[col_name].apply(lambda x: self._convert_to_epoch(x))
+                        except Exception as e_date:
+                            print(f"Warning: Could not fully convert date column '{col_name}' in '{name_key or input_filepath.name}' using format '{date_format}'. Error: {e_date}")
+
+                # Replace NaN values with None for JSON compatibility
+                df = df.astype(object).where(pd.notnull(df), None)
+
+                # Final safety check: convert any remaining datetime objects
+                for col in df.columns:
+                    df[col] = df[col].apply(lambda x: self._safe_datetime_convert(x, date_format))
+
+                current_json_segment = df.to_dict(orient=orient)
+
+                if is_direct_output:
+                    output_data_final = current_json_segment
+                    break
+                else:
+                    if name_key is not None:
+                        temp_processed_data[name_key] = current_json_segment
+
+            if not is_direct_output:
+                output_data_final = temp_processed_data
+
+            with open(json_filepath, 'w', encoding='utf-8') as f:
+                json.dump(output_data_final, f, indent=4, ensure_ascii=False)
+
+            print(f"Successfully converted '{input_filepath.name}' to '{json_filepath.name}'")
+            return True
+
+        except FileNotFoundError:
+            print(f"Error: Input file not found at {input_filepath.name}")
+            return False
+        except ValueError as ve:
+            print(f"ValueError during conversion of '{input_filepath.name}': {ve}")
+            return False
+        except Exception as e:
+            print(f"An unexpected error occurred during conversion of '{input_filepath.name}': {e}")
+            return False
+
+    def _convert_to_iso(self, value):
+        """Convert datetime-like objects to ISO format string."""
+        if pd.isnull(value) or value is None:
+            return None
+
+        try:
+            if isinstance(value, str):
+                return value # Already a string
+            elif hasattr(value, 'isoformat'):
+                return value.isoformat()
+            elif isinstance(value, pd.Timestamp):
+                return value.isoformat()
+            else:
+                return str(value)
+        except:
+            return str(value) if value is not None else None
+
+    def _convert_to_epoch(self, value):
+        """Convert datetime-like objects to epoch milliseconds."""
+        if pd.isnull(value) or value is None:
+            return None
+
+        try:
+            if isinstance(value, (int, float)):
+                return int(value) # Assume already epoch
+            elif hasattr(value, 'timestamp'):
+                return int(value.timestamp() * 1000)
+            elif isinstance(value, pd.Timestamp):
+                return int(value.timestamp() * 1000)
+            else:
+                return str(value)
+        except:
+            return str(value) if value is not None else None
+
+    def _safe_datetime_convert(self, value, date_format):
+        """Final safety conversion for any remaining datetime objects."""
+        if pd.isnull(value) or value is None:
+            return None
+
+        # If it's a datetime-like object, convert it
+        if isinstance(value, (datetime, date, pd.Timestamp)):
+            if date_format == 'iso':
+                return self._convert_to_iso(value)
+            elif date_format == 'epoch':
+                return self._convert_to_epoch(value)
+            else:
+                return str(value)
+
+        return value
+
+
+def chunk_text(text, chunk_size=500, overlap_size=50):
+    """
+    Chunks a given text into smaller pieces with optional overlap.
+
+    Args:
+        text (str): The input text to be chunked.
+        chunk_size (int): The maximum size of each chunk (in characters).
+        overlap_size (int): The number of characters to overlap between consecutive chunks.
+
+    Returns:
+        list: A list of text chunks.
+    """
+    chunks = []
+    start = 0
+    while start < len(text):
+        end = start + chunk_size
+        chunk = text[start:end]
+        chunks.append(chunk)
+        start += (chunk_size - overlap_size)
+        if start < 0: # Handle cases where overlap_size is greater than chunk_size
+            start = 0
+    return chunks
+
+if __name__ == '__main__':
+    print("Test Run Start:")
+    try:
+        # print("Test 1: scaned pdf page, bytes")
+        # with open("/home/cyto/Documents/scanned.pdf", "rb") as imgpdf:
+        #     conv= Convertor(file_bytes= imgpdf.read(), suffix= ".pdf", file_type= "pdf")
+        #     print(conv.output)
+
+        # print("Test 2: JD pdf, bytes")
+        # with open("/home/cyto/dev/pembotdir/jds/PM Trainee.pdf", "rb") as imgpdf:
+        #     conv= Convertor(file_bytes= imgpdf.read(), suffix= ".pdf", file_type= "pdf")
+        #     print(conv.output)
+
+        # print("Test 3: excel schedule, bytes")
+        # with open("/home/cyto/Downloads/Assignment schedule.xlsx", "rb") as imgpdf:
+        #     conv= Convertor(file_bytes= imgpdf.read(), suffix= ".xlsx", file_type= "excel")
+        #     print(conv.output)
+
+        # without bytes example:
+        print("Test 4: scanned pdf, path")
+        conv= Convertor(myfile= Path('/home/cyto/Documents/scanned.pdf'), output_dir= Path('/home/cyto/Documents'))
+        print(conv.output)
+
+        # print("Test 5: schedule excel, path")
+        # conv= Convertor(myfile= Path('/home/cyto/Downloads/Assignment schedule.xlsx'), output_dir= Path('/home/cyto/Downloads'))
+        # print(conv.output)
+    except FileNotFoundError as fe:
+        print("file not found, modify the driver code to get sample files to test:\n\n", fe)
+    except Exception as e:
+        print("unhandled: ", e)
+
+    print("Test Run End.")
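Since the hunk above adds an entire new module, here is a minimal usage sketch, not part of the release: it assumes pembot 0.0.5 is installed, that the import path follows the file layout in the file list above, and that a local sample.xlsx and an out/ directory exist (both hypothetical).

# Sketch only: sample.xlsx and out/ are placeholders, not files shipped with the package.
from pathlib import Path
from pembot.AnyToText.convertor import Convertor, chunk_text

# Path mode: an .xlsx is matched against EXCEL_FILE_TYPES and written to out/json/sample.json
conv = Convertor(myfile=Path("sample.xlsx"), output_dir=Path("out"))

# Bytes mode: a temporary file and directory are used, and the JSON text ends up in conv.output
with open("sample.xlsx", "rb") as fh:
    conv = Convertor(file_bytes=fh.read(), suffix=".xlsx", file_type="excel")

# chunk_text splits the extracted text into 500-character pieces with a 50-character overlap
pieces = chunk_text(conv.output, chunk_size=500, overlap_size=50)
print(len(pieces))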
{pembot-0.0.3 → pembot-0.0.5}/pembot/main.py

@@ -10,7 +10,11 @@ from pembot.query import rag_query_llm, remove_bs
 import os
 import json
 from pembot.utils.string_tools import make_it_an_id
-
+import pickle
+from sys import argv
+
+required_fields_path= ""
+required_fields= None
 
 
 def make_query(required_fields: list[tuple[str, str, str, str]]):
@@ -67,8 +71,8 @@ def save_to_json_file(llm_output: str, filepath: Path):
     except Exception as e:
         print(f"An unexpected error occurred in save_to_json_file: {e}")
 
-def make_document_summarization_and_embeddings(db_client, llm_client, inference_client, docs_dir: Path, text_out_dir: Path, required_fields: list[tuple[str, str, str, str]], chunk_size: int = 600, embedding_model: str= 'nomic-embed-text:v1.5', llm_provider_name: PROVIDER_T= "novita", model_name= "deepseek-ai/DeepSeek-R1-0528-Qwen3-8B", embeddings_collection: str= "doc_chunks", index_name= "test_search"):
-    # give required output fields
+def make_document_summarization_and_embeddings(db_client, llm_client, inference_client, docs_dir: Path, text_out_dir: Path, required_fields: list[tuple[str, str, str, str]], chunk_size: int = 600, embedding_model: str= 'nomic-embed-text:v1.5', llm_provider_name: PROVIDER_T= "novita", model_name= "deepseek-ai/DeepSeek-R1-0528-Qwen3-8B", embeddings_collection: str= "doc_chunks", index_name= "test_search"):
+    # give required output fields
     # take the documents
     # convert to text
     # upload to chromadb
@@ -80,7 +84,7 @@ def make_document_summarization_and_embeddings(db_client, llm_client, inference_
         expected_json= text_out_dir / 'json' / (file_root + '.json')
         document_id= make_it_an_id(file_root)
 
-        if docfile.is_file and not (expected_json).exists():
+        if docfile.is_file and not (expected_json).exists():
 
             expected_markdown= text_out_dir / (file_root + '.md')
             if not (expected_markdown).exists():
@@ -161,6 +165,23 @@ if __name__ == "__main__":
     #     provider="Jina AI",
     #     api_key= JINA_API_KEY,
     # )
+    #
+
+    try:
+        if len(argv) > 1:
+            print(f"First argument: {argv[1]}")
+            required_fields_path= argv[1]
+            with open(required_fields_path, "rb") as rf:
+                required_fields= pickle.load(rf)
+    except Exception as e:
+        print("error while getting required_fields pickle. Please pickle it and put it in project directory to continue\n", e)
+
+    if required_fields is None:
+        print("couldnt load required fields. please provide path to pickle in command line argument")
+        exit()
+    else:
+        print(required_fields)
+
 
     inference_client= InferenceClient(
         provider="hf-inference",
@@ -178,7 +199,7 @@ if __name__ == "__main__":
     llm_provider_name: PROVIDER_T="nebius"
 
     # nerfed, but provided by hf serverless inference: BAAI/bge-small-en-v1.5
-    # Worth mentioning:
+    # Worth mentioning:
     # jinaai/jina-embeddings-v3
     # BAAI/bge-base-en-v1.5
     # nomic-ai/nomic-embed-text-v1.5
@@ -203,6 +224,3 @@ if __name__ == "__main__":
 
     docs_collection= database["summary_docs"]
     upload_summaries(process_output_dir / 'json', docs_collection)
-
-
-
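The main.py hunks above replace the hard-coded schema dependency with a required_fields pickle whose path is read from the first CLI argument (argv[1]). A minimal sketch of preparing such a pickle follows, assuming only the list[tuple[str, str, str, str]] shape that make_query's signature declares; the field tuples and the invocation form are illustrative placeholders, not taken from the package.

# Sketch only: the tuple contents below are made-up examples of the four-string shape.
import pickle

required_fields = [
    ("candidate_name", "string", "full name of the candidate", "Jane Doe"),
    ("years_of_experience", "number", "total years of relevant experience", "4"),
]

with open("required_fields.pkl", "wb") as rf:
    pickle.dump(required_fields, rf)

# then run main.py with the pickle path as its first argument (invocation form assumed):
#   python main.py required_fields.pkl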
|