pembot 0.0.5__tar.gz → 0.0.6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pembot might be problematic. Click here for more details.
- {pembot-0.0.5 → pembot-0.0.6}/PKG-INFO +1 -1
- pembot-0.0.6/pembot/.git/COMMIT_EDITMSG +1 -0
- pembot-0.0.6/pembot/.git/index +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/.git/logs/HEAD +1 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/.git/logs/refs/heads/main +1 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/.git/logs/refs/remotes/origin/main +1 -0
- pembot-0.0.6/pembot/.git/objects/3e/23850624fcf5f111d6ea88ddd64adf924cf82f +0 -0
- pembot-0.0.6/pembot/.git/objects/4d/a03134f70896f72053fbdc0cd4f4c76d4ac1d8 +0 -0
- pembot-0.0.6/pembot/.git/objects/95/28bbccd167e3f4ad583a1ae9fac98a52620e27 +0 -0
- pembot-0.0.6/pembot/.git/objects/bd/8fd1cb166996e74a8631f3a6f764a53af75297 +0 -0
- pembot-0.0.6/pembot/.git/objects/bf/518686b06069d2a8abd3689908b7e1a6e16b05 +0 -0
- pembot-0.0.6/pembot/.git/objects/e0/9162dbd64d85bb5ed740aa99faefa73f293d78 +0 -0
- pembot-0.0.6/pembot/.git/refs/heads/main +1 -0
- pembot-0.0.6/pembot/.git/refs/remotes/origin/main +1 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/AnyToText/convertor.py +8 -6
- {pembot-0.0.5 → pembot-0.0.6}/pembot/__init__.py +1 -1
- {pembot-0.0.5 → pembot-0.0.6}/pembot/config/config.yaml +1 -1
- pembot-0.0.6/pembot/pdf2markdown/.git/COMMIT_EDITMSG +1 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/pdf2markdown/.git/config +3 -0
- pembot-0.0.6/pembot/pdf2markdown/.git/index +0 -0
- pembot-0.0.6/pembot/pdf2markdown/.git/logs/HEAD +4 -0
- pembot-0.0.6/pembot/pdf2markdown/.git/logs/refs/heads/main +4 -0
- pembot-0.0.6/pembot/pdf2markdown/.git/logs/refs/remotes/myorigin/main +3 -0
- pembot-0.0.6/pembot/pdf2markdown/.git/objects/14/251b198e0bac39a3dc3b42f9e57b20c01465fb +0 -0
- pembot-0.0.6/pembot/pdf2markdown/.git/objects/24/8f03b5f969a7fbd396b496f40b57f0ae81c148 +0 -0
- pembot-0.0.6/pembot/pdf2markdown/.git/objects/57/74dc9c3901d2ffb2cd7dafe2ad6612a7f9f42c +0 -0
- pembot-0.0.6/pembot/pdf2markdown/.git/objects/72/2dc14f82e78ce41717348b256e0c17834933b4 +0 -0
- pembot-0.0.6/pembot/pdf2markdown/.git/objects/79/eb7b93ced70e399bd561093c45de7641414dbd +0 -0
- pembot-0.0.6/pembot/pdf2markdown/.git/objects/8d/9ce1fd9733a78c592b34af9c94b98960c601ed +0 -0
- pembot-0.0.6/pembot/pdf2markdown/.git/objects/95/745843bb4377d6042180daeda818c0b16fd493 +0 -0
- pembot-0.0.6/pembot/pdf2markdown/.git/objects/a5/c6dfb577782c259990dcf977e355298e923428 +0 -0
- pembot-0.0.6/pembot/pdf2markdown/.git/objects/b4/8d697aa9fd97151eb2a84a1af5d408b7630232 +0 -0
- pembot-0.0.6/pembot/pdf2markdown/.git/objects/b8/702320e56074e9680181d8b7897d6a0a552e2d +0 -0
- pembot-0.0.6/pembot/pdf2markdown/.git/objects/e6/9de29bb2d1d6434b8b29ae775ad8c2e48c5391 +0 -0
- pembot-0.0.6/pembot/pdf2markdown/.git/refs/heads/main +1 -0
- pembot-0.0.6/pembot/pdf2markdown/.git/refs/remotes/myorigin/main +1 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/pdf2markdown/extract.py +58 -90
- pembot-0.0.6/pembot/pdf2markdown/pyrightconfig.json +4 -0
- pembot-0.0.6/pembot/requirements.txt +80 -0
- pembot-0.0.5/pembot/.git/COMMIT_EDITMSG +0 -1
- pembot-0.0.5/pembot/.git/index +0 -0
- pembot-0.0.5/pembot/.git/refs/heads/main +0 -1
- pembot-0.0.5/pembot/.git/refs/remotes/origin/main +0 -1
- pembot-0.0.5/pembot/pdf2markdown/.git/index +0 -0
- pembot-0.0.5/pembot/pdf2markdown/.git/logs/refs/heads/main +0 -1
- pembot-0.0.5/pembot/pdf2markdown/.git/logs/refs/remotes/origin/HEAD +0 -1
- pembot-0.0.5/pembot/pdf2markdown/.git/refs/heads/main +0 -1
- {pembot-0.0.5 → pembot-0.0.6}/LICENSE +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/.git/HEAD +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/.git/config +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/.git/description +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/.git/hooks/applypatch-msg.sample +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/.git/hooks/commit-msg.sample +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/.git/hooks/fsmonitor-watchman.sample +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/.git/hooks/post-update.sample +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/.git/hooks/pre-applypatch.sample +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/.git/hooks/pre-commit.sample +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/.git/hooks/pre-merge-commit.sample +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/.git/hooks/pre-push.sample +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/.git/hooks/pre-rebase.sample +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/.git/hooks/pre-receive.sample +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/.git/hooks/prepare-commit-msg.sample +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/.git/hooks/push-to-checkout.sample +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/.git/hooks/sendemail-validate.sample +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/.git/hooks/update.sample +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/.git/info/exclude +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/.git/logs/refs/remotes/origin/HEAD +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/.git/objects/0a/fb3a98cdc55b1434b44534ec2bf22c56cfa26c +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/.git/objects/0b/db4169fc0f312b8698f1df17a258fff163aeaa +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/.git/objects/0c/8d9b2690545bf1906b05cd9f18b783b3eb74f1 +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/.git/objects/18/28e18ab80aa64d334b26428708140e280cbc63 +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/.git/objects/19/f61df7dbd562d04f561288677bbf2f18f5dff7 +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/.git/objects/1f/83a471c8119f7794d98c049170a5d7d07a4b71 +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/.git/objects/28/db0ab48059acccd7d257aa02e52e9b6b83a4a5 +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/.git/objects/35/97e518a8658280be9f377f78edf1dfa1f23814 +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/.git/objects/3d/07d3b29ff53d95de3898fb786d61732f210515 +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/.git/objects/3e/cf23eb95123287531d708a21d4ba88d92ccabb +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/.git/objects/3f/78215d7e17da726fb352fd92b3c117db9b63ba +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/.git/objects/3f/e072cf3cb6a9f30c3e9936e3ddf622e80270d0 +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/.git/objects/41/cbeb6bcb4c6fa9ef9be571082d95ecb4ea0ee3 +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/.git/objects/51/9e780574933d7627a083222bd10dd74f430904 +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/.git/objects/61/46a371b9c1bd9f51af273f11f986cfd1bedeba +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/.git/objects/63/1700a51c8fa97b543991f5f61bfcd1e7e1327d +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/.git/objects/64/00040794955d17c9a1fe1aaaea59f2c4822177 +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/.git/objects/6d/7a865a23b1cb4182f67907820104ced48b11c9 +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/.git/objects/72/f047cda92abcd1ddc857f6461de605f8668331 +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/.git/objects/73/2e98f08bc806c331b06847fc8c743f545499e5 +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/.git/objects/86/cdaec229f1fbebf43042266b03878944669f25 +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/.git/objects/87/d6df5217a4a374f8c1211a05f9bd657f72c9a7 +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/.git/objects/8b/5be2af9b16f290549193859c214cd9072212e8 +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/.git/objects/93/8f29d9b4b1ae86e39dddf9e3d115a82ddfc9b6 +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/.git/objects/9b/123713e30fc9e225f9ac8ff5b02f8f8cf86456 +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/.git/objects/ab/139d2cd4798dd8e2c565b80440b1a44b376126 +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/.git/objects/ab/c6b15265171457b41e2cfdaf3b8c3994a59eb7 +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/.git/objects/ac/9c9018c62fa30dc142665c1b5a375f4e056880 +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/.git/objects/b1/1173d9b68db117437ccb9551461152e1e8a77d +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/.git/objects/b2/4e79ab07fe9e68781961a25ff9f1dbb1546fbb +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/.git/objects/b8/eea52176ffa4d88c5a9976bee26092421565d3 +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/.git/objects/bf/068a0714e2145de83a5c004f4213b091439d0e +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/.git/objects/bf/32a7e6872e5dc4025ee3df3c921ec7ade0855f +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/.git/objects/c0/793458db6e1bee7f79f1a504fb8ff4963f8ed3 +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/.git/objects/c2/443060c07101948487cfa93cc39e082e9e0f5f +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/.git/objects/d0/937f7d832266337289d5ec09459f931a46fcf7 +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/.git/objects/e5/3070f2b07f45d031444b09b1b38658f3caf29e +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/.git/objects/e7/911a702079a6144997ea4e70f59abbe59ec2bc +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/.git/objects/e9/1172752e9a421ae463112d2b0506b37498c98d +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/.git/objects/ea/0af89e61a882c5afc2a8c281b2d96f174bfe58 +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/.git/objects/eb/75e1c49f1e5b79dca17ccdbec8067756523238 +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/.git/objects/f1/655afa1c5636c8d58969e3194bb770aefbc552 +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/.git/objects/f4/e991088a63def67a30a2b8bbdb4d58514abab8 +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/.git/objects/f8/cbb5bfd1503e66cec2c593362c60a317b6d300 +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/.git/objects/f9/98e1f01c2bf0a20159fc851327af05beb3ac88 +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/.git/objects/fa/9c9a62ec1203a5868b033ded428c2382c4e1b6 +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/.git/objects/fb/6c90c9ce5e0cdfbe074a3f060afc66f62eefde +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/.git/objects/fc/988aab7e2d46396dc595ad24345e8e77dda0e4 +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/.git/objects/fc/e56f1e09d09a05b9babf796fb40bece176f3a2 +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/.git/objects/pack/pack-d5469edc8c36e3bb1de5e0070e4d5b1eae935dd4.idx +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/.git/objects/pack/pack-d5469edc8c36e3bb1de5e0070e4d5b1eae935dd4.pack +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/.git/objects/pack/pack-d5469edc8c36e3bb1de5e0070e4d5b1eae935dd4.rev +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/.git/packed-refs +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/.git/refs/remotes/origin/HEAD +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/.gitignore +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/AnyToText/__init__.py +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/LICENSE +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/TextEmbedder/__init__.py +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/TextEmbedder/gemini_embedder.py +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/TextEmbedder/mongodb_embedder.py +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/TextEmbedder/mongodb_index_creator.py +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/TextEmbedder/vector_query.py +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/gartner.py +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/main.py +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/output_structure_local.py +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/pdf2markdown/.git/HEAD +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/pdf2markdown/.git/description +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/pdf2markdown/.git/hooks/applypatch-msg.sample +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/pdf2markdown/.git/hooks/commit-msg.sample +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/pdf2markdown/.git/hooks/fsmonitor-watchman.sample +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/pdf2markdown/.git/hooks/post-update.sample +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/pdf2markdown/.git/hooks/pre-applypatch.sample +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/pdf2markdown/.git/hooks/pre-commit.sample +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/pdf2markdown/.git/hooks/pre-merge-commit.sample +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/pdf2markdown/.git/hooks/pre-push.sample +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/pdf2markdown/.git/hooks/pre-rebase.sample +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/pdf2markdown/.git/hooks/pre-receive.sample +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/pdf2markdown/.git/hooks/prepare-commit-msg.sample +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/pdf2markdown/.git/hooks/push-to-checkout.sample +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/pdf2markdown/.git/hooks/sendemail-validate.sample +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/pdf2markdown/.git/hooks/update.sample +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/pdf2markdown/.git/info/exclude +0 -0
- {pembot-0.0.5/pembot/pdf2markdown/.git/logs → pembot-0.0.6/pembot/pdf2markdown/.git/logs/refs/remotes/origin}/HEAD +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/pdf2markdown/.git/objects/pack/pack-d3051affdd6c31306dc53489168fc870872085d1.idx +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/pdf2markdown/.git/objects/pack/pack-d3051affdd6c31306dc53489168fc870872085d1.pack +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/pdf2markdown/.git/objects/pack/pack-d3051affdd6c31306dc53489168fc870872085d1.rev +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/pdf2markdown/.git/packed-refs +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/pdf2markdown/.git/refs/remotes/origin/HEAD +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/pdf2markdown/LICENSE +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/pdf2markdown/README.md +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/pdf2markdown/__init__.py +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/pdf2markdown/config/config.yaml +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/pdf2markdown/requirements.txt +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/pem.py +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/query.py +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/utils/__init__.py +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/utils/inference_client.py +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pembot/utils/string_tools.py +0 -0
- {pembot-0.0.5 → pembot-0.0.6}/pyproject.toml +0 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
handled config loading errors gracefully; added gemini support, as an option; added huggingface nanonets transformers support (as an option); redesigned the extract markdown for captioning and image ocr (block image and full-page image);
|
|
Binary file
|
|
@@ -5,3 +5,4 @@ ac9c9018c62fa30dc142665c1b5a375f4e056880 72f047cda92abcd1ddc857f6461de605f866833
|
|
|
5
5
|
e91172752e9a421ae463112d2b0506b37498c98d 0c8d9b2690545bf1906b05cd9f18b783b3eb74f1 cyto <silverstone965@gmail.com> 1749716350 +0530 commit: added a pem blog chunking module for updating from local, and, an embedding loop to embed all the blogs, with document id as the filter in the search, and the first line title as the filter in updation
|
|
6
6
|
0c8d9b2690545bf1906b05cd9f18b783b3eb74f1 eb75e1c49f1e5b79dca17ccdbec8067756523238 cyto <silverstone965@gmail.com> 1750856653 +0530 commit: made arrangements for the cases when custom file bytes are to be processed to text output; handled a ollama running / crashing error
|
|
7
7
|
eb75e1c49f1e5b79dca17ccdbec8067756523238 0bdb4169fc0f312b8698f1df17a258fff163aeaa cyto <silverstone965@gmail.com> 1750937276 +0530 commit: fixed the output_dir bug; fixed the excel to json function; ran some tests on convertor; incremented the version on the package; removed dependency on schema / structure, and shifted required fields to a pickle file path in the cli args;
|
|
8
|
+
0bdb4169fc0f312b8698f1df17a258fff163aeaa 9528bbccd167e3f4ad583a1ae9fac98a52620e27 cyto <silverstone965@gmail.com> 1750947488 +0530 commit: handled local llm nonexistent error properly for choice of just passing None as llm_client;
|
|
@@ -5,3 +5,4 @@ ac9c9018c62fa30dc142665c1b5a375f4e056880 72f047cda92abcd1ddc857f6461de605f866833
|
|
|
5
5
|
e91172752e9a421ae463112d2b0506b37498c98d 0c8d9b2690545bf1906b05cd9f18b783b3eb74f1 cyto <silverstone965@gmail.com> 1749716350 +0530 commit: added a pem blog chunking module for updating from local, and, an embedding loop to embed all the blogs, with document id as the filter in the search, and the first line title as the filter in updation
|
|
6
6
|
0c8d9b2690545bf1906b05cd9f18b783b3eb74f1 eb75e1c49f1e5b79dca17ccdbec8067756523238 cyto <silverstone965@gmail.com> 1750856653 +0530 commit: made arrangements for the cases when custom file bytes are to be processed to text output; handled a ollama running / crashing error
|
|
7
7
|
eb75e1c49f1e5b79dca17ccdbec8067756523238 0bdb4169fc0f312b8698f1df17a258fff163aeaa cyto <silverstone965@gmail.com> 1750937276 +0530 commit: fixed the output_dir bug; fixed the excel to json function; ran some tests on convertor; incremented the version on the package; removed dependency on schema / structure, and shifted required fields to a pickle file path in the cli args;
|
|
8
|
+
0bdb4169fc0f312b8698f1df17a258fff163aeaa 9528bbccd167e3f4ad583a1ae9fac98a52620e27 cyto <silverstone965@gmail.com> 1750947488 +0530 commit: handled local llm nonexistent error properly for choice of just passing None as llm_client;
|
|
@@ -4,3 +4,4 @@ ac9c9018c62fa30dc142665c1b5a375f4e056880 72f047cda92abcd1ddc857f6461de605f866833
|
|
|
4
4
|
e91172752e9a421ae463112d2b0506b37498c98d 0c8d9b2690545bf1906b05cd9f18b783b3eb74f1 cyto <silverstone965@gmail.com> 1749716371 +0530 update by push
|
|
5
5
|
0c8d9b2690545bf1906b05cd9f18b783b3eb74f1 eb75e1c49f1e5b79dca17ccdbec8067756523238 cyto <silverstone965@gmail.com> 1750856672 +0530 update by push
|
|
6
6
|
eb75e1c49f1e5b79dca17ccdbec8067756523238 0bdb4169fc0f312b8698f1df17a258fff163aeaa cyto <silverstone965@gmail.com> 1750937389 +0530 update by push
|
|
7
|
+
0bdb4169fc0f312b8698f1df17a258fff163aeaa 9528bbccd167e3f4ad583a1ae9fac98a52620e27 cyto <silverstone965@gmail.com> 1750947502 +0530 update by push
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
9528bbccd167e3f4ad583a1ae9fac98a52620e27
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
9528bbccd167e3f4ad583a1ae9fac98a52620e27
|
|
@@ -35,6 +35,8 @@ class Convertor():
|
|
|
35
35
|
|
|
36
36
|
self.output= ""
|
|
37
37
|
|
|
38
|
+
# model_name= "gemini-2.5-flash"
|
|
39
|
+
model_name= None
|
|
38
40
|
# file_type can be pdf, excel, etc.
|
|
39
41
|
if output_dir is None and myfile is None and file_bytes is not None and suffix is not None:
|
|
40
42
|
with tempfile.TemporaryDirectory() as dp:
|
|
@@ -43,7 +45,7 @@ class Convertor():
|
|
|
43
45
|
myfile= Path(fp.name)
|
|
44
46
|
output_dir= Path(dp)
|
|
45
47
|
if file_type == 'pdf':
|
|
46
|
-
extractor= MarkdownPDFExtractor(str(myfile), output_path= str(output_dir), page_delimiter= "-- NEXT PAGE --")
|
|
48
|
+
extractor= MarkdownPDFExtractor(str(myfile), output_path= str(output_dir), page_delimiter= "-- NEXT PAGE --", model_name= model_name)
|
|
47
49
|
extractor.extract()
|
|
48
50
|
with open(output_dir / (myfile.stem + '.md')) as output_file:
|
|
49
51
|
self.output= output_file.read()
|
|
@@ -67,7 +69,7 @@ class Convertor():
|
|
|
67
69
|
print("the file was json")
|
|
68
70
|
elif mt == 'application/pdf':
|
|
69
71
|
print("the file was pdf, outputting in: ", output_dir)
|
|
70
|
-
extractor= MarkdownPDFExtractor(str(myfile), output_path= str(self.output_dir), page_delimiter= "-- NEXT PAGE --")
|
|
72
|
+
extractor= MarkdownPDFExtractor(str(myfile), output_path= str(self.output_dir), page_delimiter= "-- NEXT PAGE --", model_name= model_name)
|
|
71
73
|
extractor.extract()
|
|
72
74
|
|
|
73
75
|
elif mt in EXCEL_FILE_TYPES:
|
|
@@ -333,10 +335,10 @@ def chunk_text(text, chunk_size=500, overlap_size=50):
|
|
|
333
335
|
if __name__ == '__main__':
|
|
334
336
|
print("Test Run Start:")
|
|
335
337
|
try:
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
338
|
+
print("Test 1: scaned pdf page, bytes")
|
|
339
|
+
with open("/home/cyto/Documents/scanned.pdf", "rb") as imgpdf:
|
|
340
|
+
conv= Convertor(file_bytes= imgpdf.read(), suffix= ".pdf", file_type= "pdf")
|
|
341
|
+
print(conv.output)
|
|
340
342
|
|
|
341
343
|
# print("Test 2: JD pdf, bytes")
|
|
342
344
|
# with open("/home/cyto/dev/pembotdir/jds/PM Trainee.pdf", "rb") as imgpdf:
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
cyto/argument-list-bug-fix;authentication-used-in-gradio-client
|
|
Binary file
|
|
@@ -0,0 +1,4 @@
|
|
|
1
|
+
0000000000000000000000000000000000000000 ffb759ee4605b232366a9ee58134532913c3f9e0 cyto <cyto@callisto.localdomain> 1747745478 +0530 clone: from https://github.com/iamarunbrahma/pdf-to-markdown
|
|
2
|
+
ffb759ee4605b232366a9ee58134532913c3f9e0 b8702320e56074e9680181d8b7897d6a0a552e2d cyto <silverstone965@gmail.com> 1750947962 +0530 commit: handled config loading errors gracefully; added gemini support, as an option; added huggingface nanonets transformers support (as an option); redesigned the extract markdown for captioning and image ocr (block image and full-page image);
|
|
3
|
+
b8702320e56074e9680181d8b7897d6a0a552e2d 14251b198e0bac39a3dc3b42f9e57b20c01465fb cyto <silverstone965@gmail.com> 1751604763 +0530 commit: removed deps on torch and transformers; used gradio client for ocr through public spaces;
|
|
4
|
+
14251b198e0bac39a3dc3b42f9e57b20c01465fb b48d697aa9fd97151eb2a84a1af5d408b7630232 cyto <silverstone965@gmail.com> 1751871887 +0530 commit: cyto/argument-list-bug-fix;authentication-used-in-gradio-client
|
|
@@ -0,0 +1,4 @@
|
|
|
1
|
+
0000000000000000000000000000000000000000 ffb759ee4605b232366a9ee58134532913c3f9e0 cyto <cyto@callisto.localdomain> 1747745478 +0530 clone: from https://github.com/iamarunbrahma/pdf-to-markdown
|
|
2
|
+
ffb759ee4605b232366a9ee58134532913c3f9e0 b8702320e56074e9680181d8b7897d6a0a552e2d cyto <silverstone965@gmail.com> 1750947962 +0530 commit: handled config loading errors gracefully; added gemini support, as an option; added huggingface nanonets transformers support (as an option); redesigned the extract markdown for captioning and image ocr (block image and full-page image);
|
|
3
|
+
b8702320e56074e9680181d8b7897d6a0a552e2d 14251b198e0bac39a3dc3b42f9e57b20c01465fb cyto <silverstone965@gmail.com> 1751604763 +0530 commit: removed deps on torch and transformers; used gradio client for ocr through public spaces;
|
|
4
|
+
14251b198e0bac39a3dc3b42f9e57b20c01465fb b48d697aa9fd97151eb2a84a1af5d408b7630232 cyto <silverstone965@gmail.com> 1751871887 +0530 commit: cyto/argument-list-bug-fix;authentication-used-in-gradio-client
|
|
@@ -0,0 +1,3 @@
|
|
|
1
|
+
0000000000000000000000000000000000000000 b8702320e56074e9680181d8b7897d6a0a552e2d cyto <silverstone965@gmail.com> 1750948073 +0530 update by push
|
|
2
|
+
b8702320e56074e9680181d8b7897d6a0a552e2d 14251b198e0bac39a3dc3b42f9e57b20c01465fb cyto <silverstone965@gmail.com> 1751604904 +0530 update by push
|
|
3
|
+
14251b198e0bac39a3dc3b42f9e57b20c01465fb b48d697aa9fd97151eb2a84a1af5d408b7630232 cyto <silverstone965@gmail.com> 1751872077 +0530 update by push
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
b48d697aa9fd97151eb2a84a1af5d408b7630232
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
b48d697aa9fd97151eb2a84a1af5d408b7630232
|
|
@@ -2,11 +2,9 @@ import fitz
|
|
|
2
2
|
import pdfplumber
|
|
3
3
|
import re
|
|
4
4
|
import yaml
|
|
5
|
-
|
|
5
|
+
import pytesseract
|
|
6
6
|
import numpy as np
|
|
7
|
-
from transformers import AutoTokenizer, AutoProcessor, AutoModelForImageTextToText, VisionEncoderDecoderModel, ViTImageProcessor
|
|
8
7
|
from typing import Literal, final
|
|
9
|
-
import torch
|
|
10
8
|
from PIL import Image
|
|
11
9
|
import os
|
|
12
10
|
import logging
|
|
@@ -19,6 +17,9 @@ import io
|
|
|
19
17
|
from google import genai
|
|
20
18
|
from google.genai import types
|
|
21
19
|
import mimetypes
|
|
20
|
+
from gradio_client import Client, handle_file
|
|
21
|
+
import gradio as gr
|
|
22
|
+
import tempfile
|
|
22
23
|
|
|
23
24
|
|
|
24
25
|
|
|
@@ -75,25 +76,18 @@ class MarkdownPDFExtractor(PDFExtractor):
|
|
|
75
76
|
super().__init__(pdf_path)
|
|
76
77
|
|
|
77
78
|
if model_name is None:
|
|
78
|
-
self.MODEL_NAME= "gemini-2.5-flash"
|
|
79
|
+
# self.MODEL_NAME= "gemini-2.5-flash"
|
|
80
|
+
self.MODEL_NAME= "Nanonets-OCR-s"
|
|
79
81
|
else:
|
|
80
82
|
self.MODEL_NAME= model_name
|
|
81
83
|
|
|
82
84
|
if "gemini" in self.MODEL_NAME:
|
|
83
85
|
self.gclient = genai.Client(api_key= os.getenv("GEMINI_API_KEY", ''))
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
self.model = AutoModelForImageTextToText.from_pretrained(
|
|
87
|
-
model_path,
|
|
88
|
-
torch_dtype="auto",
|
|
89
|
-
device_map="auto",
|
|
90
|
-
attn_implementation="flash_attention_2"
|
|
91
|
-
)
|
|
92
|
-
self.model.eval()
|
|
93
|
-
self.tokenizer = AutoTokenizer.from_pretrained(model_path)
|
|
94
|
-
self.processor = AutoProcessor.from_pretrained(model_path)
|
|
95
|
-
self.setup_image_captioning()
|
|
86
|
+
elif "anonet" in self.MODEL_NAME:
|
|
87
|
+
# self.nclient= Client("prithivMLmods/Multimodal-OCR2")
|
|
96
88
|
|
|
89
|
+
# zerogpu public
|
|
90
|
+
self.nclient= Client("deepak-mehta/ocr-simplify", hf_token= os.getenv('HF_TOKEN', ''))
|
|
97
91
|
|
|
98
92
|
|
|
99
93
|
self.markdown_content= ""
|
|
@@ -108,25 +102,6 @@ class MarkdownPDFExtractor(PDFExtractor):
|
|
|
108
102
|
|
|
109
103
|
|
|
110
104
|
|
|
111
|
-
def setup_image_captioning(self):
|
|
112
|
-
"""Set up the image captioning model."""
|
|
113
|
-
try:
|
|
114
|
-
self.model = VisionEncoderDecoderModel.from_pretrained(
|
|
115
|
-
"nlpconnect/vit-gpt2-image-captioning"
|
|
116
|
-
)
|
|
117
|
-
self.feature_extractor = ViTImageProcessor.from_pretrained(
|
|
118
|
-
"nlpconnect/vit-gpt2-image-captioning"
|
|
119
|
-
)
|
|
120
|
-
self.tokenizer = AutoTokenizer.from_pretrained(
|
|
121
|
-
"nlpconnect/vit-gpt2-image-captioning"
|
|
122
|
-
)
|
|
123
|
-
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
|
124
|
-
self.model.to(self.device)
|
|
125
|
-
self.logger.info("Image captioning model set up successfully.")
|
|
126
|
-
except Exception as e:
|
|
127
|
-
self.logger.error(f"Error setting up image captioning model: {e}")
|
|
128
|
-
self.logger.exception(traceback.format_exc())
|
|
129
|
-
|
|
130
105
|
def extract(self):
|
|
131
106
|
try:
|
|
132
107
|
markdown_content, markdown_pages = self.extract_markdown()
|
|
@@ -143,12 +118,18 @@ class MarkdownPDFExtractor(PDFExtractor):
|
|
|
143
118
|
return "", []
|
|
144
119
|
|
|
145
120
|
|
|
146
|
-
def
|
|
147
|
-
|
|
121
|
+
def image_ocr(self, pil_image, img_bytes, max_new_tokens: int | None = None, prompt: str | None= None):
|
|
122
|
+
if prompt is None:
|
|
123
|
+
prompt = """Extract the text from the above document as if you were reading it naturally. Return the tables in html format. Return the equations in LaTeX representation. If there is an image in the document and image caption is not present, add a small description of the image inside the <img></img> tag; otherwise, add the image caption inside <img></img>. Watermarks should be wrapped in brackets. Ex: <watermark>OFFICIAL COPY</watermark>. Page numbers should be wrapped in brackets. Ex: <page_number>14</page_number> or <page_number>9/22</page_number>. Prefer using ☐ and ☑ for check boxes."""
|
|
148
124
|
if max_new_tokens is None:
|
|
149
125
|
max_new_tokens= 4096
|
|
150
126
|
|
|
151
|
-
|
|
127
|
+
w, h= pil_image.size
|
|
128
|
+
if w < 200 or h < 50:
|
|
129
|
+
return "<img> A small image </img>"
|
|
130
|
+
|
|
131
|
+
model_name= self.MODEL_NAME.lower()
|
|
132
|
+
if 'gemini' in model_name:
|
|
152
133
|
|
|
153
134
|
image_format = pil_image.format
|
|
154
135
|
dummy_filename = f"dummy.{image_format.lower()}"
|
|
@@ -165,24 +146,40 @@ class MarkdownPDFExtractor(PDFExtractor):
|
|
|
165
146
|
)
|
|
166
147
|
# print("response :", response)
|
|
167
148
|
return response.text
|
|
168
|
-
|
|
169
|
-
image = pil_image
|
|
170
|
-
messages = [
|
|
171
|
-
{"role": "system", "content": "You are a helpful assistant."},
|
|
172
|
-
{"role": "user", "content": [
|
|
173
|
-
{"type": "image", "image": image},
|
|
174
|
-
{"type": "text", "text": prompt},
|
|
175
|
-
]},
|
|
176
|
-
]
|
|
177
|
-
text = self.processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
|
|
178
|
-
inputs = self.processor(text=[text], images=[image], padding=True, return_tensors="pt")
|
|
179
|
-
inputs = inputs.to(self.model.device)
|
|
149
|
+
elif 'nanonet' in model_name:
|
|
180
150
|
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
151
|
+
result= ""
|
|
152
|
+
try:
|
|
153
|
+
with tempfile.NamedTemporaryFile(suffix=f'.{pil_image.format.lower()}', mode= 'w') as temp_file:
|
|
154
|
+
pil_image.save(temp_file.name)
|
|
155
|
+
print("file name: ", temp_file.name)
|
|
156
|
+
gr_image= handle_file(temp_file.name)
|
|
157
|
+
print("gr image : ", gr_image)
|
|
158
|
+
result = self.nclient.predict(
|
|
159
|
+
# model_name="Nanonets-OCR-s",
|
|
160
|
+
# text= prompt,
|
|
161
|
+
gr_image,
|
|
162
|
+
# max_new_tokens=max_new_tokens,
|
|
163
|
+
# temperature=0.6,
|
|
164
|
+
# top_p=0.9,
|
|
165
|
+
# top_k=50,
|
|
166
|
+
# repetition_penalty=1.2,
|
|
167
|
+
|
|
168
|
+
# prithiv model
|
|
169
|
+
# api_name="/generate_image"
|
|
170
|
+
|
|
171
|
+
max_new_tokens,
|
|
172
|
+
|
|
173
|
+
# spaces zerogpu
|
|
174
|
+
api_name="/predict"
|
|
175
|
+
)
|
|
176
|
+
print("ocr'd: ", result[:100] + "...")
|
|
177
|
+
except Exception as e:
|
|
178
|
+
print("Error during nanonet inference", e)
|
|
179
|
+
|
|
180
|
+
return result
|
|
181
|
+
else:
|
|
182
|
+
return pytesseract.image_to_string(pil_image)
|
|
186
183
|
|
|
187
184
|
|
|
188
185
|
|
|
@@ -219,7 +216,7 @@ class MarkdownPDFExtractor(PDFExtractor):
|
|
|
219
216
|
for page_num, page in enumerate(doc):
|
|
220
217
|
current_page_markdown_blocks = [] # Collect markdown blocks for the current page
|
|
221
218
|
page_has_searchable_text = False
|
|
222
|
-
page_has_embedded_images = False
|
|
219
|
+
# page_has_embedded_images = False
|
|
223
220
|
|
|
224
221
|
self.logger.info(f"\nProcessing page {page_num + 1}...")
|
|
225
222
|
|
|
@@ -252,7 +249,7 @@ class MarkdownPDFExtractor(PDFExtractor):
|
|
|
252
249
|
try:
|
|
253
250
|
image_bytes= io.BytesIO(img_data)
|
|
254
251
|
pil_image = Image.open(image_bytes)
|
|
255
|
-
ocr_text_from_block_image = self.
|
|
252
|
+
ocr_text_from_block_image = self.image_ocr(
|
|
256
253
|
pil_image, image_bytes, max_new_tokens=15000
|
|
257
254
|
)
|
|
258
255
|
|
|
@@ -292,7 +289,7 @@ class MarkdownPDFExtractor(PDFExtractor):
|
|
|
292
289
|
image_bytestream= io.BytesIO(img_bytes)
|
|
293
290
|
pil_image = Image.open(image_bytestream)
|
|
294
291
|
|
|
295
|
-
ocr_text_from_page = self.
|
|
292
|
+
ocr_text_from_page = self.image_ocr(
|
|
296
293
|
pil_image, image_bytestream, max_new_tokens=15000
|
|
297
294
|
)
|
|
298
295
|
|
|
@@ -389,7 +386,7 @@ class MarkdownPDFExtractor(PDFExtractor):
|
|
|
389
386
|
# ocr_result = pytesseract.image_to_string(
|
|
390
387
|
# image
|
|
391
388
|
# )
|
|
392
|
-
ocr_result= self.
|
|
389
|
+
ocr_result= self.image_ocr(image, image_bytes, max_new_tokens=15000)
|
|
393
390
|
|
|
394
391
|
|
|
395
392
|
return ocr_result.strip()
|
|
@@ -409,38 +406,9 @@ class MarkdownPDFExtractor(PDFExtractor):
|
|
|
409
406
|
if image.mode != "RGB":
|
|
410
407
|
image = image.convert("RGB")
|
|
411
408
|
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
mime_type, _ = mimetypes.guess_type(dummy_filename)
|
|
415
|
-
|
|
416
|
-
if "gemini" in self.MODEL_NAME:
|
|
417
|
-
response= self.gclient.models.generate_content(
|
|
418
|
-
model= self.MODEL_NAME,
|
|
419
|
-
contents=[
|
|
420
|
-
types.Part.from_bytes(
|
|
421
|
-
data=image_bytes.getvalue(),
|
|
422
|
-
mime_type= mime_type
|
|
423
|
-
),
|
|
424
|
-
"Write a caption for this image"
|
|
425
|
-
]
|
|
426
|
-
)
|
|
427
|
-
return response.text
|
|
428
|
-
else:
|
|
429
|
-
# Ensure the image is in the correct shape
|
|
430
|
-
image = np.array(image).transpose(2, 0, 1) # Convert to (C, H, W) format
|
|
431
|
-
|
|
432
|
-
inputs = self.feature_extractor(images=image, return_tensors="pt").to(
|
|
433
|
-
self.device
|
|
434
|
-
)
|
|
435
|
-
pixel_values = inputs.pixel_values
|
|
436
|
-
|
|
437
|
-
generated_ids = self.model.generate(pixel_values, max_length=30)
|
|
409
|
+
caption= self.image_ocr(image, image_bytes, max_new_tokens=15000, prompt= "Write a caption for this image")
|
|
410
|
+
return caption
|
|
438
411
|
|
|
439
|
-
generated_ids = self.model.generate(pixel_values, max_length=30)
|
|
440
|
-
generated_caption = self.tokenizer.batch_decode(
|
|
441
|
-
generated_ids, skip_special_tokens=True
|
|
442
|
-
)[0]
|
|
443
|
-
return generated_caption.strip()
|
|
444
412
|
except Exception as e:
|
|
445
413
|
self.logger.error(f"Error captioning image: {e}")
|
|
446
414
|
self.logger.exception(traceback.format_exc())
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
aiofiles==24.1.0
|
|
2
|
+
annotated-types==0.7.0
|
|
3
|
+
anyio==4.9.0
|
|
4
|
+
audioop-lts==0.2.1
|
|
5
|
+
cachetools==5.5.2
|
|
6
|
+
certifi==2025.6.15
|
|
7
|
+
cffi==1.17.1
|
|
8
|
+
charset-normalizer==3.4.2
|
|
9
|
+
click==8.2.1
|
|
10
|
+
cryptography==45.0.5
|
|
11
|
+
dnspython==2.7.0
|
|
12
|
+
et_xmlfile==2.0.0
|
|
13
|
+
fastapi==0.115.14
|
|
14
|
+
ffmpy==0.6.0
|
|
15
|
+
filelock==3.18.0
|
|
16
|
+
fsspec==2025.5.1
|
|
17
|
+
google-auth==2.40.3
|
|
18
|
+
google-genai==1.24.0
|
|
19
|
+
gradio==5.35.0
|
|
20
|
+
gradio_client==1.10.4
|
|
21
|
+
greenlet==3.2.3
|
|
22
|
+
groovy==0.1.2
|
|
23
|
+
h11==0.16.0
|
|
24
|
+
hf-xet==1.1.5
|
|
25
|
+
httpcore==1.0.9
|
|
26
|
+
httpx==0.28.1
|
|
27
|
+
huggingface-hub==0.33.2
|
|
28
|
+
idna==3.10
|
|
29
|
+
Jinja2==3.1.6
|
|
30
|
+
markdown-it-py==3.0.0
|
|
31
|
+
MarkupSafe==3.0.2
|
|
32
|
+
mdurl==0.1.2
|
|
33
|
+
msgpack==1.1.1
|
|
34
|
+
numpy==2.3.1
|
|
35
|
+
ollama==0.5.1
|
|
36
|
+
openpyxl==3.1.5
|
|
37
|
+
orjson==3.10.18
|
|
38
|
+
packaging==25.0
|
|
39
|
+
pandas==2.3.0
|
|
40
|
+
pathlib==1.0.1
|
|
41
|
+
pdfminer.six==20250506
|
|
42
|
+
pdfplumber==0.11.7
|
|
43
|
+
pembot==0.0.6
|
|
44
|
+
pillow==11.3.0
|
|
45
|
+
pyasn1==0.6.1
|
|
46
|
+
pyasn1_modules==0.4.2
|
|
47
|
+
pycparser==2.22
|
|
48
|
+
pydantic==2.11.7
|
|
49
|
+
pydantic_core==2.33.2
|
|
50
|
+
pydub==0.25.1
|
|
51
|
+
Pygments==2.19.2
|
|
52
|
+
pymongo==4.13.2
|
|
53
|
+
PyMuPDF==1.26.3
|
|
54
|
+
pynvim==0.5.2
|
|
55
|
+
pypdfium2==4.30.1
|
|
56
|
+
pytesseract==0.3.13
|
|
57
|
+
python-dateutil==2.9.0.post0
|
|
58
|
+
python-multipart==0.0.20
|
|
59
|
+
pytz==2025.2
|
|
60
|
+
PyYAML==6.0.2
|
|
61
|
+
requests==2.32.4
|
|
62
|
+
rich==14.0.0
|
|
63
|
+
rsa==4.9.1
|
|
64
|
+
ruff==0.12.1
|
|
65
|
+
safehttpx==0.1.6
|
|
66
|
+
semantic-version==2.10.0
|
|
67
|
+
shellingham==1.5.4
|
|
68
|
+
six==1.17.0
|
|
69
|
+
sniffio==1.3.1
|
|
70
|
+
starlette==0.46.2
|
|
71
|
+
tenacity==8.5.0
|
|
72
|
+
tomlkit==0.13.3
|
|
73
|
+
tqdm==4.67.1
|
|
74
|
+
typer==0.16.0
|
|
75
|
+
typing-inspection==0.4.1
|
|
76
|
+
typing_extensions==4.14.0
|
|
77
|
+
tzdata==2025.2
|
|
78
|
+
urllib3==2.5.0
|
|
79
|
+
uvicorn==0.35.0
|
|
80
|
+
websockets==15.0.1
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
fixed the output_dir bug; fixed the excel to json function; ran some tests on convertor; incremented the version on the package; removed dependency on schema / structure, and shifted required fields to a pickle file path in the cli args;
|
pembot-0.0.5/pembot/.git/index
DELETED
|
Binary file
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
0bdb4169fc0f312b8698f1df17a258fff163aeaa
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
0bdb4169fc0f312b8698f1df17a258fff163aeaa
|
|
Binary file
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
0000000000000000000000000000000000000000 ffb759ee4605b232366a9ee58134532913c3f9e0 cyto <cyto@callisto.localdomain> 1747745478 +0530 clone: from https://github.com/iamarunbrahma/pdf-to-markdown
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
0000000000000000000000000000000000000000 ffb759ee4605b232366a9ee58134532913c3f9e0 cyto <cyto@callisto.localdomain> 1747745478 +0530 clone: from https://github.com/iamarunbrahma/pdf-to-markdown
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
ffb759ee4605b232366a9ee58134532913c3f9e0
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|