pembot 0.0.3__tar.gz → 0.0.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of pembot might be problematic; see the registry's advisory for details.

Files changed (140)
  1. {pembot-0.0.3 → pembot-0.0.5}/PKG-INFO +1 -1
  2. pembot-0.0.5/pembot/.git/COMMIT_EDITMSG +1 -0
  3. {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/index +0 -0
  4. {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/logs/HEAD +1 -0
  5. {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/logs/refs/heads/main +1 -0
  6. {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/logs/refs/remotes/origin/main +1 -0
  7. pembot-0.0.5/pembot/.git/objects/0b/db4169fc0f312b8698f1df17a258fff163aeaa +0 -0
  8. pembot-0.0.5/pembot/.git/objects/1f/83a471c8119f7794d98c049170a5d7d07a4b71 +0 -0
  9. pembot-0.0.5/pembot/.git/objects/41/cbeb6bcb4c6fa9ef9be571082d95ecb4ea0ee3 +0 -0
  10. pembot-0.0.5/pembot/.git/objects/63/1700a51c8fa97b543991f5f61bfcd1e7e1327d +0 -0
  11. pembot-0.0.5/pembot/.git/objects/ab/139d2cd4798dd8e2c565b80440b1a44b376126 +0 -0
  12. pembot-0.0.5/pembot/.git/objects/bf/068a0714e2145de83a5c004f4213b091439d0e +0 -0
  13. pembot-0.0.5/pembot/.git/objects/d0/937f7d832266337289d5ec09459f931a46fcf7 +0 -0
  14. pembot-0.0.5/pembot/.git/objects/fc/988aab7e2d46396dc595ad24345e8e77dda0e4 +0 -0
  15. pembot-0.0.5/pembot/.git/refs/heads/main +1 -0
  16. pembot-0.0.5/pembot/.git/refs/remotes/origin/main +1 -0
  17. pembot-0.0.5/pembot/AnyToText/convertor.py +364 -0
  18. {pembot-0.0.3 → pembot-0.0.5}/pembot/__init__.py +1 -1
  19. {pembot-0.0.3 → pembot-0.0.5}/pembot/config/config.yaml +1 -1
  20. {pembot-0.0.3 → pembot-0.0.5}/pembot/main.py +26 -8
  21. {pembot-0.0.3 → pembot-0.0.5}/pembot/pdf2markdown/extract.py +266 -309
  22. {pembot-0.0.3 → pembot-0.0.5}/pembot/query.py +15 -9
  23. pembot-0.0.3/pembot/.git/COMMIT_EDITMSG +0 -1
  24. pembot-0.0.3/pembot/.git/refs/heads/main +0 -1
  25. pembot-0.0.3/pembot/.git/refs/remotes/origin/main +0 -1
  26. pembot-0.0.3/pembot/AnyToText/convertor.py +0 -260
  27. {pembot-0.0.3 → pembot-0.0.5}/LICENSE +0 -0
  28. {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/HEAD +0 -0
  29. {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/config +0 -0
  30. {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/description +0 -0
  31. {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/hooks/applypatch-msg.sample +0 -0
  32. {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/hooks/commit-msg.sample +0 -0
  33. {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/hooks/fsmonitor-watchman.sample +0 -0
  34. {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/hooks/post-update.sample +0 -0
  35. {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/hooks/pre-applypatch.sample +0 -0
  36. {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/hooks/pre-commit.sample +0 -0
  37. {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/hooks/pre-merge-commit.sample +0 -0
  38. {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/hooks/pre-push.sample +0 -0
  39. {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/hooks/pre-rebase.sample +0 -0
  40. {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/hooks/pre-receive.sample +0 -0
  41. {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/hooks/prepare-commit-msg.sample +0 -0
  42. {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/hooks/push-to-checkout.sample +0 -0
  43. {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/hooks/sendemail-validate.sample +0 -0
  44. {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/hooks/update.sample +0 -0
  45. {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/info/exclude +0 -0
  46. {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/logs/refs/remotes/origin/HEAD +0 -0
  47. {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/objects/0a/fb3a98cdc55b1434b44534ec2bf22c56cfa26c +0 -0
  48. {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/objects/0c/8d9b2690545bf1906b05cd9f18b783b3eb74f1 +0 -0
  49. {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/objects/18/28e18ab80aa64d334b26428708140e280cbc63 +0 -0
  50. {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/objects/19/f61df7dbd562d04f561288677bbf2f18f5dff7 +0 -0
  51. {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/objects/28/db0ab48059acccd7d257aa02e52e9b6b83a4a5 +0 -0
  52. {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/objects/35/97e518a8658280be9f377f78edf1dfa1f23814 +0 -0
  53. {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/objects/3d/07d3b29ff53d95de3898fb786d61732f210515 +0 -0
  54. {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/objects/3e/cf23eb95123287531d708a21d4ba88d92ccabb +0 -0
  55. {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/objects/3f/78215d7e17da726fb352fd92b3c117db9b63ba +0 -0
  56. {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/objects/3f/e072cf3cb6a9f30c3e9936e3ddf622e80270d0 +0 -0
  57. {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/objects/51/9e780574933d7627a083222bd10dd74f430904 +0 -0
  58. {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/objects/61/46a371b9c1bd9f51af273f11f986cfd1bedeba +0 -0
  59. {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/objects/64/00040794955d17c9a1fe1aaaea59f2c4822177 +0 -0
  60. {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/objects/6d/7a865a23b1cb4182f67907820104ced48b11c9 +0 -0
  61. {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/objects/72/f047cda92abcd1ddc857f6461de605f8668331 +0 -0
  62. {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/objects/73/2e98f08bc806c331b06847fc8c743f545499e5 +0 -0
  63. {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/objects/86/cdaec229f1fbebf43042266b03878944669f25 +0 -0
  64. {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/objects/87/d6df5217a4a374f8c1211a05f9bd657f72c9a7 +0 -0
  65. {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/objects/8b/5be2af9b16f290549193859c214cd9072212e8 +0 -0
  66. {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/objects/93/8f29d9b4b1ae86e39dddf9e3d115a82ddfc9b6 +0 -0
  67. {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/objects/9b/123713e30fc9e225f9ac8ff5b02f8f8cf86456 +0 -0
  68. {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/objects/ab/c6b15265171457b41e2cfdaf3b8c3994a59eb7 +0 -0
  69. {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/objects/ac/9c9018c62fa30dc142665c1b5a375f4e056880 +0 -0
  70. {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/objects/b1/1173d9b68db117437ccb9551461152e1e8a77d +0 -0
  71. {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/objects/b2/4e79ab07fe9e68781961a25ff9f1dbb1546fbb +0 -0
  72. {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/objects/b8/eea52176ffa4d88c5a9976bee26092421565d3 +0 -0
  73. {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/objects/bf/32a7e6872e5dc4025ee3df3c921ec7ade0855f +0 -0
  74. {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/objects/c0/793458db6e1bee7f79f1a504fb8ff4963f8ed3 +0 -0
  75. {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/objects/c2/443060c07101948487cfa93cc39e082e9e0f5f +0 -0
  76. {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/objects/e5/3070f2b07f45d031444b09b1b38658f3caf29e +0 -0
  77. {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/objects/e7/911a702079a6144997ea4e70f59abbe59ec2bc +0 -0
  78. {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/objects/e9/1172752e9a421ae463112d2b0506b37498c98d +0 -0
  79. {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/objects/ea/0af89e61a882c5afc2a8c281b2d96f174bfe58 +0 -0
  80. {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/objects/eb/75e1c49f1e5b79dca17ccdbec8067756523238 +0 -0
  81. {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/objects/f1/655afa1c5636c8d58969e3194bb770aefbc552 +0 -0
  82. {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/objects/f4/e991088a63def67a30a2b8bbdb4d58514abab8 +0 -0
  83. {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/objects/f8/cbb5bfd1503e66cec2c593362c60a317b6d300 +0 -0
  84. {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/objects/f9/98e1f01c2bf0a20159fc851327af05beb3ac88 +0 -0
  85. {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/objects/fa/9c9a62ec1203a5868b033ded428c2382c4e1b6 +0 -0
  86. {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/objects/fb/6c90c9ce5e0cdfbe074a3f060afc66f62eefde +0 -0
  87. {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/objects/fc/e56f1e09d09a05b9babf796fb40bece176f3a2 +0 -0
  88. {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/objects/pack/pack-d5469edc8c36e3bb1de5e0070e4d5b1eae935dd4.idx +0 -0
  89. {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/objects/pack/pack-d5469edc8c36e3bb1de5e0070e4d5b1eae935dd4.pack +0 -0
  90. {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/objects/pack/pack-d5469edc8c36e3bb1de5e0070e4d5b1eae935dd4.rev +0 -0
  91. {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/packed-refs +0 -0
  92. {pembot-0.0.3 → pembot-0.0.5}/pembot/.git/refs/remotes/origin/HEAD +0 -0
  93. {pembot-0.0.3 → pembot-0.0.5}/pembot/.gitignore +0 -0
  94. {pembot-0.0.3 → pembot-0.0.5}/pembot/AnyToText/__init__.py +0 -0
  95. {pembot-0.0.3 → pembot-0.0.5}/pembot/LICENSE +0 -0
  96. {pembot-0.0.3 → pembot-0.0.5}/pembot/TextEmbedder/__init__.py +0 -0
  97. {pembot-0.0.3 → pembot-0.0.5}/pembot/TextEmbedder/gemini_embedder.py +0 -0
  98. {pembot-0.0.3 → pembot-0.0.5}/pembot/TextEmbedder/mongodb_embedder.py +0 -0
  99. {pembot-0.0.3 → pembot-0.0.5}/pembot/TextEmbedder/mongodb_index_creator.py +0 -0
  100. {pembot-0.0.3 → pembot-0.0.5}/pembot/TextEmbedder/vector_query.py +0 -0
  101. {pembot-0.0.3 → pembot-0.0.5}/pembot/gartner.py +0 -0
  102. {pembot-0.0.3 → pembot-0.0.5}/pembot/output_structure_local.py +0 -0
  103. {pembot-0.0.3 → pembot-0.0.5}/pembot/pdf2markdown/.git/HEAD +0 -0
  104. {pembot-0.0.3 → pembot-0.0.5}/pembot/pdf2markdown/.git/config +0 -0
  105. {pembot-0.0.3 → pembot-0.0.5}/pembot/pdf2markdown/.git/description +0 -0
  106. {pembot-0.0.3 → pembot-0.0.5}/pembot/pdf2markdown/.git/hooks/applypatch-msg.sample +0 -0
  107. {pembot-0.0.3 → pembot-0.0.5}/pembot/pdf2markdown/.git/hooks/commit-msg.sample +0 -0
  108. {pembot-0.0.3 → pembot-0.0.5}/pembot/pdf2markdown/.git/hooks/fsmonitor-watchman.sample +0 -0
  109. {pembot-0.0.3 → pembot-0.0.5}/pembot/pdf2markdown/.git/hooks/post-update.sample +0 -0
  110. {pembot-0.0.3 → pembot-0.0.5}/pembot/pdf2markdown/.git/hooks/pre-applypatch.sample +0 -0
  111. {pembot-0.0.3 → pembot-0.0.5}/pembot/pdf2markdown/.git/hooks/pre-commit.sample +0 -0
  112. {pembot-0.0.3 → pembot-0.0.5}/pembot/pdf2markdown/.git/hooks/pre-merge-commit.sample +0 -0
  113. {pembot-0.0.3 → pembot-0.0.5}/pembot/pdf2markdown/.git/hooks/pre-push.sample +0 -0
  114. {pembot-0.0.3 → pembot-0.0.5}/pembot/pdf2markdown/.git/hooks/pre-rebase.sample +0 -0
  115. {pembot-0.0.3 → pembot-0.0.5}/pembot/pdf2markdown/.git/hooks/pre-receive.sample +0 -0
  116. {pembot-0.0.3 → pembot-0.0.5}/pembot/pdf2markdown/.git/hooks/prepare-commit-msg.sample +0 -0
  117. {pembot-0.0.3 → pembot-0.0.5}/pembot/pdf2markdown/.git/hooks/push-to-checkout.sample +0 -0
  118. {pembot-0.0.3 → pembot-0.0.5}/pembot/pdf2markdown/.git/hooks/sendemail-validate.sample +0 -0
  119. {pembot-0.0.3 → pembot-0.0.5}/pembot/pdf2markdown/.git/hooks/update.sample +0 -0
  120. {pembot-0.0.3 → pembot-0.0.5}/pembot/pdf2markdown/.git/index +0 -0
  121. {pembot-0.0.3 → pembot-0.0.5}/pembot/pdf2markdown/.git/info/exclude +0 -0
  122. {pembot-0.0.3 → pembot-0.0.5}/pembot/pdf2markdown/.git/logs/HEAD +0 -0
  123. {pembot-0.0.3 → pembot-0.0.5}/pembot/pdf2markdown/.git/logs/refs/heads/main +0 -0
  124. {pembot-0.0.3 → pembot-0.0.5}/pembot/pdf2markdown/.git/logs/refs/remotes/origin/HEAD +0 -0
  125. {pembot-0.0.3 → pembot-0.0.5}/pembot/pdf2markdown/.git/objects/pack/pack-d3051affdd6c31306dc53489168fc870872085d1.idx +0 -0
  126. {pembot-0.0.3 → pembot-0.0.5}/pembot/pdf2markdown/.git/objects/pack/pack-d3051affdd6c31306dc53489168fc870872085d1.pack +0 -0
  127. {pembot-0.0.3 → pembot-0.0.5}/pembot/pdf2markdown/.git/objects/pack/pack-d3051affdd6c31306dc53489168fc870872085d1.rev +0 -0
  128. {pembot-0.0.3 → pembot-0.0.5}/pembot/pdf2markdown/.git/packed-refs +0 -0
  129. {pembot-0.0.3 → pembot-0.0.5}/pembot/pdf2markdown/.git/refs/heads/main +0 -0
  130. {pembot-0.0.3 → pembot-0.0.5}/pembot/pdf2markdown/.git/refs/remotes/origin/HEAD +0 -0
  131. {pembot-0.0.3 → pembot-0.0.5}/pembot/pdf2markdown/LICENSE +0 -0
  132. {pembot-0.0.3 → pembot-0.0.5}/pembot/pdf2markdown/README.md +0 -0
  133. {pembot-0.0.3 → pembot-0.0.5}/pembot/pdf2markdown/__init__.py +0 -0
  134. {pembot-0.0.3 → pembot-0.0.5}/pembot/pdf2markdown/config/config.yaml +0 -0
  135. {pembot-0.0.3 → pembot-0.0.5}/pembot/pdf2markdown/requirements.txt +0 -0
  136. {pembot-0.0.3 → pembot-0.0.5}/pembot/pem.py +0 -0
  137. {pembot-0.0.3 → pembot-0.0.5}/pembot/utils/__init__.py +0 -0
  138. {pembot-0.0.3 → pembot-0.0.5}/pembot/utils/inference_client.py +0 -0
  139. {pembot-0.0.3 → pembot-0.0.5}/pembot/utils/string_tools.py +0 -0
  140. {pembot-0.0.3 → pembot-0.0.5}/pyproject.toml +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: pembot
- Version: 0.0.3
+ Version: 0.0.5
  Summary: A Python Package to convert PEM blog content to usseful information by leveraging LLMs
  Author-email: cyto <aryan_sidhwani@protonmail.com>
  License-Expression: MIT
@@ -0,0 +1 @@
+ fixed the output_dir bug; fixed the excel to json function; ran some tests on convertor; incremented the version on the package; removed dependency on schema / structure, and shifted required fields to a pickle file path in the cli args;
@@ -4,3 +4,4 @@ ac9c9018c62fa30dc142665c1b5a375f4e056880 72f047cda92abcd1ddc857f6461de605f866833
  72f047cda92abcd1ddc857f6461de605f8668331 e91172752e9a421ae463112d2b0506b37498c98d cyto <silverstone965@gmail.com> 1748881846 +0530 commit: added gemini to the embedders and llms ladders; redeclared the required fields;
  e91172752e9a421ae463112d2b0506b37498c98d 0c8d9b2690545bf1906b05cd9f18b783b3eb74f1 cyto <silverstone965@gmail.com> 1749716350 +0530 commit: added a pem blog chunking module for updating from local, and, an embedding loop to embed all the blogs, with document id as the filter in the search, and the first line title as the filter in updation
  0c8d9b2690545bf1906b05cd9f18b783b3eb74f1 eb75e1c49f1e5b79dca17ccdbec8067756523238 cyto <silverstone965@gmail.com> 1750856653 +0530 commit: made arrangements for the cases when custom file bytes are to be processed to text output; handled a ollama running / crashing error
+ eb75e1c49f1e5b79dca17ccdbec8067756523238 0bdb4169fc0f312b8698f1df17a258fff163aeaa cyto <silverstone965@gmail.com> 1750937276 +0530 commit: fixed the output_dir bug; fixed the excel to json function; ran some tests on convertor; incremented the version on the package; removed dependency on schema / structure, and shifted required fields to a pickle file path in the cli args;
@@ -4,3 +4,4 @@ ac9c9018c62fa30dc142665c1b5a375f4e056880 72f047cda92abcd1ddc857f6461de605f866833
  72f047cda92abcd1ddc857f6461de605f8668331 e91172752e9a421ae463112d2b0506b37498c98d cyto <silverstone965@gmail.com> 1748881846 +0530 commit: added gemini to the embedders and llms ladders; redeclared the required fields;
  e91172752e9a421ae463112d2b0506b37498c98d 0c8d9b2690545bf1906b05cd9f18b783b3eb74f1 cyto <silverstone965@gmail.com> 1749716350 +0530 commit: added a pem blog chunking module for updating from local, and, an embedding loop to embed all the blogs, with document id as the filter in the search, and the first line title as the filter in updation
  0c8d9b2690545bf1906b05cd9f18b783b3eb74f1 eb75e1c49f1e5b79dca17ccdbec8067756523238 cyto <silverstone965@gmail.com> 1750856653 +0530 commit: made arrangements for the cases when custom file bytes are to be processed to text output; handled a ollama running / crashing error
+ eb75e1c49f1e5b79dca17ccdbec8067756523238 0bdb4169fc0f312b8698f1df17a258fff163aeaa cyto <silverstone965@gmail.com> 1750937276 +0530 commit: fixed the output_dir bug; fixed the excel to json function; ran some tests on convertor; incremented the version on the package; removed dependency on schema / structure, and shifted required fields to a pickle file path in the cli args;
@@ -3,3 +3,4 @@ ac9c9018c62fa30dc142665c1b5a375f4e056880 72f047cda92abcd1ddc857f6461de605f866833
  72f047cda92abcd1ddc857f6461de605f8668331 e91172752e9a421ae463112d2b0506b37498c98d cyto <silverstone965@gmail.com> 1748881859 +0530 update by push
  e91172752e9a421ae463112d2b0506b37498c98d 0c8d9b2690545bf1906b05cd9f18b783b3eb74f1 cyto <silverstone965@gmail.com> 1749716371 +0530 update by push
  0c8d9b2690545bf1906b05cd9f18b783b3eb74f1 eb75e1c49f1e5b79dca17ccdbec8067756523238 cyto <silverstone965@gmail.com> 1750856672 +0530 update by push
+ eb75e1c49f1e5b79dca17ccdbec8067756523238 0bdb4169fc0f312b8698f1df17a258fff163aeaa cyto <silverstone965@gmail.com> 1750937389 +0530 update by push
@@ -0,0 +1 @@
+ 0bdb4169fc0f312b8698f1df17a258fff163aeaa
@@ -0,0 +1 @@
+ 0bdb4169fc0f312b8698f1df17a258fff163aeaa
@@ -0,0 +1,364 @@
+ from tempfile import TemporaryDirectory
+ import mimetypes
+ from pathlib import Path
+ from pembot.pdf2markdown.extract import MarkdownPDFExtractor
+ import os
+ import json
+ import pandas as pd
+ from typing import Literal, Union, Dict, Any, List
+ import tempfile
+ from datetime import datetime, date
+
+
+ PandasReadEngineType = Literal['xlrd', 'openpyxl', 'odf', 'pyxlsb', 'calamine', None]
+
+ EXCEL_FILE_TYPES= [
+     'text/csv',
+     'application/vnd.ms-excel',
+     'application/msexcel',
+     'application/x-msexcel',
+     'application/x-ms-excel',
+     'application/x-excel',
+     'application/x-dos_ms_excel',
+     'application/x-dos_ms_excel',
+     'application/xls',
+     'application/x-xls',
+     'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
+     'application/vnd.oasis.opendocument.spreadsheet',
+ ]
+
+
+ class Convertor():
+
+
+     def __init__(self, myfile: Path | None= None, output_dir: Path | None= None, file_bytes: bytes | None= None, suffix: str | None= None, file_type: str | None= None):
+
+         self.output= ""
+
+         # file_type can be pdf, excel, etc.
+         if output_dir is None and myfile is None and file_bytes is not None and suffix is not None:
+             with tempfile.TemporaryDirectory() as dp:
+                 with tempfile.NamedTemporaryFile(suffix= suffix, mode= 'wb') as fp:
+                     fp.write(file_bytes)
+                     myfile= Path(fp.name)
+                     output_dir= Path(dp)
+                     if file_type == 'pdf':
+                         extractor= MarkdownPDFExtractor(str(myfile), output_path= str(output_dir), page_delimiter= "-- NEXT PAGE --")
+                         extractor.extract()
+                         with open(output_dir / (myfile.stem + '.md')) as output_file:
+                             self.output= output_file.read()
+                     elif file_type == 'excel':
+                         self.input_filepath= myfile
+                         self.json_filepath = output_dir / (myfile.stem + ".json")
+                         self.convert_file_to_json()
+                         with open(output_dir / (myfile.stem + '.json')) as output_file:
+                             self.output= output_file.read()
+
+         elif output_dir is not None and myfile is not None:
+             print("got output path for conversion: ", output_dir)
+             mt= mimetypes.guess_file_type(str(myfile))[0]
+
+             self.output_dir= output_dir
+             self.input_filepath= myfile
+             base_name, _ = os.path.splitext(myfile.name)
+             self.json_filepath = output_dir / 'json' / (base_name + ".json")
+
+             if mt == 'application/json':
+                 print("the file was json")
+             elif mt == 'application/pdf':
+                 print("the file was pdf, outputting in: ", output_dir)
+                 extractor= MarkdownPDFExtractor(str(myfile), output_path= str(self.output_dir), page_delimiter= "-- NEXT PAGE --")
+                 extractor.extract()
+
+             elif mt in EXCEL_FILE_TYPES:
+                 self.convert_file_to_json()
+
+             else:
+                 print(mt)
+
+     def convert_file_to_json(
+         self,
+         sheet_to_convert: Union[str, int, None] = None,  # Relevant for Excel/ODS
+         orient: Literal['dict', 'list', 'series', 'split', 'records', 'index'] = 'records',  # Corrected type hint
+         date_format: Union[str, None] = 'iso',  # 'iso', 'epoch', or None
+         csv_encoding: str = 'utf-8',  # For reading CSV files
+         excel_ods_engine: PandasReadEngineType = None  # For Excel/ODS, e.g., 'openpyxl', 'xlrd', 'odf'
+     ) -> bool:
+         """
+         Converts an Excel, ODS, or CSV file (or a specific Excel/ODS sheet)
+         into an equivalent JSON format.
+
+         Args:
+             sheet_to_convert (str | int | None, optional):
+                 - For Excel/ODS:
+                     - If None (default): Converts all sheets. The JSON output will be a
+                       dictionary where keys are sheet names and values are the JSON
+                       representation of each sheet.
+                     - If str: Name of the specific sheet to convert.
+                     - If int: Index of the specific sheet to convert (0-based).
+                     If a specific sheet is requested, the JSON output will directly be
+                     the representation of that sheet.
+                 - For CSV: This parameter is ignored. The entire CSV is processed.
+             orient (str, optional): Pandas DataFrame.to_dict() orientation for each sheet/CSV.
+                 Default: 'records'. See pandas.DataFrame.to_dict() documentation.
+             date_format (str | None, optional): Format for datetime objects.
+                 - 'iso' (default): ISO8601 format (e.g., '2023-10-27T10:30:00').
+                 - 'epoch': Milliseconds since epoch.
+                 - None: Pandas default (often Timestamps). 'iso' is generally safer for JSON.
+             csv_encoding (str, optional): Encoding for reading CSV files. Default is 'utf-8'.
+             excel_ods_engine (str | None, optional): Pandas engine for reading Excel or ODS files.
+                 - For Excel: 'openpyxl' (for .xlsx), 'xlrd' (for .xls).
+                 - For ODS: 'odf' (requires 'odfpy' library).
+                 If None, pandas auto-detects based on file extension and installed libraries.
+
+         Returns:
+             bool: True if conversion was successful, False otherwise.
+         """
+
+         input_filepath = self.input_filepath
+         json_filepath = self.json_filepath
+
+         try:
+
+             if not input_filepath.exists():
+                 print(f"Error: Input file not found at {input_filepath}")
+                 return False
+
+             # Ensure output directory exists
+             json_filepath.parent.mkdir(parents=True, exist_ok=True)
+
+             file_suffix = input_filepath.suffix.lower()
+             output_data_final: Union[Dict[str, Any], List[Dict[str, Any]]] = {}
+
+             dataframes_to_process: list[tuple[pd.DataFrame, str | None]] = []
+
+             current_engine: PandasReadEngineType = excel_ods_engine
+
+             if file_suffix == '.csv':
+                 if sheet_to_convert is not None:
+                     print(f"Info: 'sheet_to_convert' parameter ('{sheet_to_convert}') is ignored for CSV file '{input_filepath.name}'. Processing entire CSV.")
+                 try:
+                     df = pd.read_csv(input_filepath, encoding=csv_encoding)
+                     dataframes_to_process.append((df, None))
+                 except Exception as e:
+                     print(f"Error reading CSV file '{input_filepath.name}': {e}")
+                     return False
+
+             elif file_suffix in ['.xls', '.xlsx', '.ods']:
+                 try:
+                     if file_suffix == '.ods':
+                         if current_engine is None:
+                             current_engine = 'odf'
+                         elif current_engine != 'odf':
+                             print(f"Warning: Specified engine '{current_engine}' may not be optimal for ODS. Forcing 'odf'.")
+                             current_engine = 'odf'
+
+                     if sheet_to_convert is not None:
+                         df = pd.read_excel(input_filepath, sheet_name=sheet_to_convert, engine=current_engine)
+                         dataframes_to_process.append((df, None))
+
+                     else:
+                         excel_file = pd.ExcelFile(input_filepath, engine=current_engine)
+                         if not excel_file.sheet_names:
+                             print(f"Warning: File '{input_filepath.name}' contains no sheets.")
+                         for sheet_name in excel_file.sheet_names:
+                             df = excel_file.parse(sheet_name)  # engine is inherited
+                             dataframes_to_process.append((df, sheet_name))
+                 except ImportError as ie:
+                     if 'odfpy' in str(ie).lower() and file_suffix == '.ods':
+                         print(f"Error reading ODS file '{input_filepath.name}': The 'odfpy' library is required. Please install it using 'pip install odfpy'.")
+                     elif 'xlrd' in str(ie).lower() and file_suffix == '.xls':
+                         print(f"Error reading .xls file '{input_filepath.name}': The 'xlrd' library might be required. Please install it using 'pip install xlrd'.")
+                     elif 'openpyxl' in str(ie).lower() and file_suffix == '.xlsx':
+                         print(f"Error reading .xlsx file '{input_filepath.name}': The 'openpyxl' library might be required. Please install it using 'pip install openpyxl'.")
+                     else:
+                         print(f"ImportError reading file '{input_filepath.name}': {ie}")
+                     return False
+                 except Exception as e:
+                     print(f"Error reading Excel/ODS file '{input_filepath.name}': {e}")
+                     return False
+             else:
+                 print(f"Error: Unsupported file type: '{file_suffix}'. Please provide a CSV, XLS, XLSX, or ODS file.")
+                 return False
+
+             if not dataframes_to_process and file_suffix in ['.xls', '.xlsx', '.ods'] and sheet_to_convert is None:
+                 print(f"Info: No dataframes were loaded from '{input_filepath.name}'. Output JSON will be empty if processing all sheets from an empty file.")
+             elif not dataframes_to_process and not (file_suffix in ['.xls', '.xlsx', '.ods'] and sheet_to_convert is None):
+                 pass
+
+             is_direct_output = len(dataframes_to_process) == 1 and dataframes_to_process[0][1] is None
+             temp_processed_data: Dict[str, Any] = {}
+
+             for df_original, name_key in dataframes_to_process:
+                 df = df_original.copy()
+
+                 # Handle datetime columns with improved detection and conversion
+                 if date_format:
+                     # Check for datetime columns using multiple approaches
+                     datetime_columns = []
+
+                     # Method 1: Use pandas dtype detection
+                     datetime_columns.extend(df.select_dtypes(include=['datetime64[ns]', 'datetime', 'datetimetz']).columns.tolist())
+
+                     # Method 2: Check for datetime objects in each column
+                     for col in df.columns:
+                         if col not in datetime_columns:
+                             # Sample a few non-null values to check type
+                             sample_values = df[col].dropna().head(10)
+                             if len(sample_values) > 0:
+                                 for val in sample_values:
+                                     if isinstance(val, (datetime, date, pd.Timestamp)):
+                                         datetime_columns.append(col)
+                                         break
+
+                     # Convert datetime columns
+                     for col_name in datetime_columns:
+                         try:
+                             if date_format == 'iso':
+                                 df[col_name] = df[col_name].apply(lambda x: self._convert_to_iso(x))
+                             elif date_format == 'epoch':
+                                 df[col_name] = df[col_name].apply(lambda x: self._convert_to_epoch(x))
+                         except Exception as e_date:
+                             print(f"Warning: Could not fully convert date column '{col_name}' in '{name_key or input_filepath.name}' using format '{date_format}'. Error: {e_date}")
+
+                 # Replace NaN values with None for JSON compatibility
+                 df = df.astype(object).where(pd.notnull(df), None)
+
+                 # Final safety check: convert any remaining datetime objects
+                 for col in df.columns:
+                     df[col] = df[col].apply(lambda x: self._safe_datetime_convert(x, date_format))
+
+                 current_json_segment = df.to_dict(orient=orient)
+
+                 if is_direct_output:
+                     output_data_final = current_json_segment
+                     break
+                 else:
+                     if name_key is not None:
+                         temp_processed_data[name_key] = current_json_segment
+
+             if not is_direct_output:
+                 output_data_final = temp_processed_data
+
+             with open(json_filepath, 'w', encoding='utf-8') as f:
+                 json.dump(output_data_final, f, indent=4, ensure_ascii=False)
+
+             print(f"Successfully converted '{input_filepath.name}' to '{json_filepath.name}'")
+             return True
+
+         except FileNotFoundError:
+             print(f"Error: Input file not found at {input_filepath.name}")
+             return False
+         except ValueError as ve:
+             print(f"ValueError during conversion of '{input_filepath.name}': {ve}")
+             return False
+         except Exception as e:
+             print(f"An unexpected error occurred during conversion of '{input_filepath.name}': {e}")
+             return False
+
+     def _convert_to_iso(self, value):
+         """Convert datetime-like objects to ISO format string."""
+         if pd.isnull(value) or value is None:
+             return None
+
+         try:
+             if isinstance(value, str):
+                 return value  # Already a string
+             elif hasattr(value, 'isoformat'):
+                 return value.isoformat()
+             elif isinstance(value, pd.Timestamp):
+                 return value.isoformat()
+             else:
+                 return str(value)
+         except:
+             return str(value) if value is not None else None
+
+     def _convert_to_epoch(self, value):
+         """Convert datetime-like objects to epoch milliseconds."""
+         if pd.isnull(value) or value is None:
+             return None
+
+         try:
+             if isinstance(value, (int, float)):
+                 return int(value)  # Assume already epoch
+             elif hasattr(value, 'timestamp'):
+                 return int(value.timestamp() * 1000)
+             elif isinstance(value, pd.Timestamp):
+                 return int(value.timestamp() * 1000)
+             else:
+                 return str(value)
+         except:
+             return str(value) if value is not None else None
+
+     def _safe_datetime_convert(self, value, date_format):
+         """Final safety conversion for any remaining datetime objects."""
+         if pd.isnull(value) or value is None:
+             return None
+
+         # If it's a datetime-like object, convert it
+         if isinstance(value, (datetime, date, pd.Timestamp)):
+             if date_format == 'iso':
+                 return self._convert_to_iso(value)
+             elif date_format == 'epoch':
+                 return self._convert_to_epoch(value)
+             else:
+                 return str(value)
+
+         return value
+
+
+ def chunk_text(text, chunk_size=500, overlap_size=50):
+     """
+     Chunks a given text into smaller pieces with optional overlap.
+
+     Args:
+         text (str): The input text to be chunked.
+         chunk_size (int): The maximum size of each chunk (in characters).
+         overlap_size (int): The number of characters to overlap between consecutive chunks.
+
+     Returns:
+         list: A list of text chunks.
+     """
+     chunks = []
+     start = 0
+     while start < len(text):
+         end = start + chunk_size
+         chunk = text[start:end]
+         chunks.append(chunk)
+         start += (chunk_size - overlap_size)
+         if start < 0:  # Handle cases where overlap_size is greater than chunk_size
+             start = 0
+     return chunks
+
+ if __name__ == '__main__':
+     print("Test Run Start:")
+     try:
+         # print("Test 1: scaned pdf page, bytes")
+         # with open("/home/cyto/Documents/scanned.pdf", "rb") as imgpdf:
+         #     conv= Convertor(file_bytes= imgpdf.read(), suffix= ".pdf", file_type= "pdf")
+         #     print(conv.output)
+
+         # print("Test 2: JD pdf, bytes")
+         # with open("/home/cyto/dev/pembotdir/jds/PM Trainee.pdf", "rb") as imgpdf:
+         #     conv= Convertor(file_bytes= imgpdf.read(), suffix= ".pdf", file_type= "pdf")
+         #     print(conv.output)
+
+         # print("Test 3: excel schedule, bytes")
+         # with open("/home/cyto/Downloads/Assignment schedule.xlsx", "rb") as imgpdf:
+         #     conv= Convertor(file_bytes= imgpdf.read(), suffix= ".xlsx", file_type= "excel")
+         #     print(conv.output)
+
+         # without bytes example:
+         print("Test 4: scanned pdf, path")
+         conv= Convertor(myfile= Path('/home/cyto/Documents/scanned.pdf'), output_dir= Path('/home/cyto/Documents'))
+         print(conv.output)
+
+         # print("Test 5: schedule excel, path")
+         # conv= Convertor(myfile= Path('/home/cyto/Downloads/Assignment schedule.xlsx'), output_dir= Path('/home/cyto/Downloads'))
+         # print(conv.output)
+     except FileNotFoundError as fe:
+         print("file not found, modify the driver code to get sample files to test:\n\n", fe)
+     except Exception as e:
+         print("unhandled: ", e)
+
+     print("Test Run End.")
@@ -1,6 +1,6 @@
  """
  A Python Package to convert PEM blog content to usseful information by leveraging LLMs
  """
- __version__ = '0.0.3'
+ __version__ = '0.0.5'
  from .main import save_to_json_file, make_query
  __all__ = ["save_to_json_file", "make_query"]
@@ -2,4 +2,4 @@ OUTPUT_DIR: /home/cyto/dev/pembotdir
  PAGE_DELIMITER: ___________________________ NEXT PAGE ___________________________
  app:
    name: pembot
-   version: 0.0.3
+   version: 0.0.5
@@ -10,7 +10,11 @@ from pembot.query import rag_query_llm, remove_bs
  import os
  import json
  from pembot.utils.string_tools import make_it_an_id
- from schema.structure import required_fields
+ import pickle
+ from sys import argv
+
+ required_fields_path= ""
+ required_fields= None


  def make_query(required_fields: list[tuple[str, str, str, str]]):
@@ -67,8 +71,8 @@ def save_to_json_file(llm_output: str, filepath: Path):
      except Exception as e:
          print(f"An unexpected error occurred in save_to_json_file: {e}")

- def make_document_summarization_and_embeddings(db_client, llm_client, inference_client, docs_dir: Path, text_out_dir: Path, required_fields: list[tuple[str, str, str, str]], chunk_size: int = 600, embedding_model: str= 'nomic-embed-text:v1.5', llm_provider_name: PROVIDER_T= "novita", model_name= "deepseek-ai/DeepSeek-R1-0528-Qwen3-8B", embeddings_collection: str= "doc_chunks", index_name= "test_search"):
-     # give required output fields
+ def make_document_summarization_and_embeddings(db_client, llm_client, inference_client, docs_dir: Path, text_out_dir: Path, required_fields: list[tuple[str, str, str, str]], chunk_size: int = 600, embedding_model: str= 'nomic-embed-text:v1.5', llm_provider_name: PROVIDER_T= "novita", model_name= "deepseek-ai/DeepSeek-R1-0528-Qwen3-8B", embeddings_collection: str= "doc_chunks", index_name= "test_search"):
+     # give required output fields
      # take the documents
      # convert to text
      # upload to chromadb
@@ -80,7 +84,7 @@ def make_document_summarization_and_embeddings(db_client, llm_client, inference_
          expected_json= text_out_dir / 'json' / (file_root + '.json')
          document_id= make_it_an_id(file_root)

-         if docfile.is_file and not (expected_json).exists():
+         if docfile.is_file and not (expected_json).exists():

              expected_markdown= text_out_dir / (file_root + '.md')
              if not (expected_markdown).exists():
@@ -161,6 +165,23 @@ if __name__ == "__main__":
      # provider="Jina AI",
      # api_key= JINA_API_KEY,
      # )
+     #
+
+     try:
+         if len(argv) > 1:
+             print(f"First argument: {argv[1]}")
+             required_fields_path= argv[1]
+             with open(required_fields_path, "rb") as rf:
+                 required_fields= pickle.load(rf)
+     except Exception as e:
+         print("error while getting required_fields pickle. Please pickle it and put it in project directory to continue\n", e)
+
+     if required_fields is None:
+         print("couldnt load required fields. please provide path to pickle in command line argument")
+         exit()
+     else:
+         print(required_fields)
+

      inference_client= InferenceClient(
          provider="hf-inference",
@@ -178,7 +199,7 @@ if __name__ == "__main__":
      llm_provider_name: PROVIDER_T="nebius"

      # nerfed, but provided by hf serverless inference: BAAI/bge-small-en-v1.5
-     # Worth mentioning:
+     # Worth mentioning:
      # jinaai/jina-embeddings-v3
      # BAAI/bge-base-en-v1.5
      # nomic-ai/nomic-embed-text-v1.5
@@ -203,6 +224,3 @@ if __name__ == "__main__":

      docs_collection= database["summary_docs"]
      upload_summaries(process_output_dir / 'json', docs_collection)
-
-
-
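
For completeness: with the schema.structure import removed, main.py now expects the path to a pickled required_fields list as its first command-line argument. A sketch of producing a compatible pickle, with the caveat that the tuple contents below are illustrative rather than the package's actual field definitions; only the list[tuple[str, str, str, str]] shape is taken from make_query's annotation:

    import pickle

    # hypothetical 4-tuples of strings; adapt to your own schema
    required_fields = [
        ("title", "string", "the document title", "required"),
        ("summary", "string", "a short abstract", "required"),
    ]

    with open("required_fields.pkl", "wb") as f:
        pickle.dump(required_fields, f)

    # then, e.g.: python -m pembot.main required_fields.pkl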