pembot 0.0.5__tar.gz → 0.0.7__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pembot might be problematic. Click here for more details.

Files changed (180) hide show
  1. {pembot-0.0.5 → pembot-0.0.7}/PKG-INFO +1 -1
  2. pembot-0.0.7/pembot/.git/COMMIT_EDITMSG +1 -0
  3. pembot-0.0.7/pembot/.git/index +0 -0
  4. {pembot-0.0.5 → pembot-0.0.7}/pembot/.git/logs/HEAD +3 -0
  5. {pembot-0.0.5 → pembot-0.0.7}/pembot/.git/logs/refs/heads/main +3 -0
  6. {pembot-0.0.5 → pembot-0.0.7}/pembot/.git/logs/refs/remotes/origin/main +3 -0
  7. pembot-0.0.7/pembot/.git/objects/0c/ab66ffbaf50ef60dd41f3498595ebd2526b33c +0 -0
  8. pembot-0.0.7/pembot/.git/objects/3e/23850624fcf5f111d6ea88ddd64adf924cf82f +0 -0
  9. pembot-0.0.7/pembot/.git/objects/41/ae8fa8f8baa2daee5ec0aa21ae17922ae051a0 +0 -0
  10. pembot-0.0.7/pembot/.git/objects/4d/a03134f70896f72053fbdc0cd4f4c76d4ac1d8 +0 -0
  11. pembot-0.0.7/pembot/.git/objects/50/39b29fda67743a044993436df6a4a1db7b8888 +0 -0
  12. pembot-0.0.7/pembot/.git/objects/7a/7d28b0313a3d9d509823faaae31949af8610ef +0 -0
  13. pembot-0.0.7/pembot/.git/objects/7e/0907822f7d316ebe0be07e1f6918bef412c80b +0 -0
  14. pembot-0.0.7/pembot/.git/objects/95/28bbccd167e3f4ad583a1ae9fac98a52620e27 +0 -0
  15. pembot-0.0.7/pembot/.git/objects/af/80ddb5890f062e364ea8ade2d602df4e12de8c +0 -0
  16. pembot-0.0.7/pembot/.git/objects/b8/884c6145221ac66f84bf88919754c2cb05c12d +0 -0
  17. pembot-0.0.7/pembot/.git/objects/bd/8fd1cb166996e74a8631f3a6f764a53af75297 +0 -0
  18. pembot-0.0.7/pembot/.git/objects/bf/518686b06069d2a8abd3689908b7e1a6e16b05 +0 -0
  19. pembot-0.0.7/pembot/.git/objects/e0/9162dbd64d85bb5ed740aa99faefa73f293d78 +0 -0
  20. pembot-0.0.7/pembot/.git/objects/ee/a73c7f24094ed83b014f7cfce46e10f817bec8 +0 -0
  21. pembot-0.0.7/pembot/.git/objects/ef/0503a60244391590b16042019032e91d7cc30d +3 -0
  22. pembot-0.0.7/pembot/.git/objects/f6/b1d54483ce20fbcb252a8a93a5eff7bec88729 +0 -0
  23. pembot-0.0.7/pembot/.git/objects/f8/6fbd490878cb0d3c35cc4443672d1309171bf1 +0 -0
  24. pembot-0.0.7/pembot/.git/refs/heads/main +1 -0
  25. pembot-0.0.7/pembot/.git/refs/remotes/origin/main +1 -0
  26. {pembot-0.0.5 → pembot-0.0.7}/pembot/AnyToText/convertor.py +11 -7
  27. {pembot-0.0.5 → pembot-0.0.7}/pembot/__init__.py +1 -1
  28. {pembot-0.0.5 → pembot-0.0.7}/pembot/config/config.yaml +1 -1
  29. pembot-0.0.7/pembot/pdf2markdown/.git/COMMIT_EDITMSG +1 -0
  30. {pembot-0.0.5 → pembot-0.0.7}/pembot/pdf2markdown/.git/config +3 -0
  31. pembot-0.0.7/pembot/pdf2markdown/.git/index +0 -0
  32. pembot-0.0.7/pembot/pdf2markdown/.git/logs/HEAD +5 -0
  33. pembot-0.0.7/pembot/pdf2markdown/.git/logs/refs/heads/main +5 -0
  34. pembot-0.0.7/pembot/pdf2markdown/.git/logs/refs/remotes/myorigin/main +4 -0
  35. pembot-0.0.7/pembot/pdf2markdown/.git/objects/14/251b198e0bac39a3dc3b42f9e57b20c01465fb +0 -0
  36. pembot-0.0.7/pembot/pdf2markdown/.git/objects/24/7b15a6b1e0e3d270c05af184f048736376cd4e +0 -0
  37. pembot-0.0.7/pembot/pdf2markdown/.git/objects/24/8f03b5f969a7fbd396b496f40b57f0ae81c148 +0 -0
  38. pembot-0.0.7/pembot/pdf2markdown/.git/objects/57/74dc9c3901d2ffb2cd7dafe2ad6612a7f9f42c +0 -0
  39. pembot-0.0.7/pembot/pdf2markdown/.git/objects/72/2dc14f82e78ce41717348b256e0c17834933b4 +0 -0
  40. pembot-0.0.7/pembot/pdf2markdown/.git/objects/79/eb7b93ced70e399bd561093c45de7641414dbd +0 -0
  41. pembot-0.0.7/pembot/pdf2markdown/.git/objects/8d/9ce1fd9733a78c592b34af9c94b98960c601ed +0 -0
  42. pembot-0.0.7/pembot/pdf2markdown/.git/objects/95/745843bb4377d6042180daeda818c0b16fd493 +0 -0
  43. pembot-0.0.7/pembot/pdf2markdown/.git/objects/a5/c6dfb577782c259990dcf977e355298e923428 +0 -0
  44. pembot-0.0.7/pembot/pdf2markdown/.git/objects/a7/4bcd5e67cb1066dd504b92b42390fe0b2c3d38 +0 -0
  45. pembot-0.0.7/pembot/pdf2markdown/.git/objects/b4/8d697aa9fd97151eb2a84a1af5d408b7630232 +0 -0
  46. pembot-0.0.7/pembot/pdf2markdown/.git/objects/b8/702320e56074e9680181d8b7897d6a0a552e2d +0 -0
  47. pembot-0.0.7/pembot/pdf2markdown/.git/objects/e6/9de29bb2d1d6434b8b29ae775ad8c2e48c5391 +0 -0
  48. pembot-0.0.7/pembot/pdf2markdown/.git/objects/f3/b2d76c75bbd50e04fc4c2ad17fc94ca6daed32 +1 -0
  49. pembot-0.0.7/pembot/pdf2markdown/.git/refs/heads/main +1 -0
  50. pembot-0.0.7/pembot/pdf2markdown/.git/refs/remotes/myorigin/main +1 -0
  51. {pembot-0.0.5 → pembot-0.0.7}/pembot/pdf2markdown/extract.py +84 -91
  52. pembot-0.0.7/pembot/pdf2markdown/pyrightconfig.json +4 -0
  53. pembot-0.0.7/pembot/requirements.txt +80 -0
  54. pembot-0.0.5/pembot/.git/COMMIT_EDITMSG +0 -1
  55. pembot-0.0.5/pembot/.git/index +0 -0
  56. pembot-0.0.5/pembot/.git/refs/heads/main +0 -1
  57. pembot-0.0.5/pembot/.git/refs/remotes/origin/main +0 -1
  58. pembot-0.0.5/pembot/pdf2markdown/.git/index +0 -0
  59. pembot-0.0.5/pembot/pdf2markdown/.git/logs/refs/heads/main +0 -1
  60. pembot-0.0.5/pembot/pdf2markdown/.git/logs/refs/remotes/origin/HEAD +0 -1
  61. pembot-0.0.5/pembot/pdf2markdown/.git/refs/heads/main +0 -1
  62. {pembot-0.0.5 → pembot-0.0.7}/LICENSE +0 -0
  63. {pembot-0.0.5 → pembot-0.0.7}/pembot/.git/HEAD +0 -0
  64. {pembot-0.0.5 → pembot-0.0.7}/pembot/.git/config +0 -0
  65. {pembot-0.0.5 → pembot-0.0.7}/pembot/.git/description +0 -0
  66. {pembot-0.0.5 → pembot-0.0.7}/pembot/.git/hooks/applypatch-msg.sample +0 -0
  67. {pembot-0.0.5 → pembot-0.0.7}/pembot/.git/hooks/commit-msg.sample +0 -0
  68. {pembot-0.0.5 → pembot-0.0.7}/pembot/.git/hooks/fsmonitor-watchman.sample +0 -0
  69. {pembot-0.0.5 → pembot-0.0.7}/pembot/.git/hooks/post-update.sample +0 -0
  70. {pembot-0.0.5 → pembot-0.0.7}/pembot/.git/hooks/pre-applypatch.sample +0 -0
  71. {pembot-0.0.5 → pembot-0.0.7}/pembot/.git/hooks/pre-commit.sample +0 -0
  72. {pembot-0.0.5 → pembot-0.0.7}/pembot/.git/hooks/pre-merge-commit.sample +0 -0
  73. {pembot-0.0.5 → pembot-0.0.7}/pembot/.git/hooks/pre-push.sample +0 -0
  74. {pembot-0.0.5 → pembot-0.0.7}/pembot/.git/hooks/pre-rebase.sample +0 -0
  75. {pembot-0.0.5 → pembot-0.0.7}/pembot/.git/hooks/pre-receive.sample +0 -0
  76. {pembot-0.0.5 → pembot-0.0.7}/pembot/.git/hooks/prepare-commit-msg.sample +0 -0
  77. {pembot-0.0.5 → pembot-0.0.7}/pembot/.git/hooks/push-to-checkout.sample +0 -0
  78. {pembot-0.0.5 → pembot-0.0.7}/pembot/.git/hooks/sendemail-validate.sample +0 -0
  79. {pembot-0.0.5 → pembot-0.0.7}/pembot/.git/hooks/update.sample +0 -0
  80. {pembot-0.0.5 → pembot-0.0.7}/pembot/.git/info/exclude +0 -0
  81. {pembot-0.0.5 → pembot-0.0.7}/pembot/.git/logs/refs/remotes/origin/HEAD +0 -0
  82. {pembot-0.0.5 → pembot-0.0.7}/pembot/.git/objects/0a/fb3a98cdc55b1434b44534ec2bf22c56cfa26c +0 -0
  83. {pembot-0.0.5 → pembot-0.0.7}/pembot/.git/objects/0b/db4169fc0f312b8698f1df17a258fff163aeaa +0 -0
  84. {pembot-0.0.5 → pembot-0.0.7}/pembot/.git/objects/0c/8d9b2690545bf1906b05cd9f18b783b3eb74f1 +0 -0
  85. {pembot-0.0.5 → pembot-0.0.7}/pembot/.git/objects/18/28e18ab80aa64d334b26428708140e280cbc63 +0 -0
  86. {pembot-0.0.5 → pembot-0.0.7}/pembot/.git/objects/19/f61df7dbd562d04f561288677bbf2f18f5dff7 +0 -0
  87. {pembot-0.0.5 → pembot-0.0.7}/pembot/.git/objects/1f/83a471c8119f7794d98c049170a5d7d07a4b71 +0 -0
  88. {pembot-0.0.5 → pembot-0.0.7}/pembot/.git/objects/28/db0ab48059acccd7d257aa02e52e9b6b83a4a5 +0 -0
  89. {pembot-0.0.5 → pembot-0.0.7}/pembot/.git/objects/35/97e518a8658280be9f377f78edf1dfa1f23814 +0 -0
  90. {pembot-0.0.5 → pembot-0.0.7}/pembot/.git/objects/3d/07d3b29ff53d95de3898fb786d61732f210515 +0 -0
  91. {pembot-0.0.5 → pembot-0.0.7}/pembot/.git/objects/3e/cf23eb95123287531d708a21d4ba88d92ccabb +0 -0
  92. {pembot-0.0.5 → pembot-0.0.7}/pembot/.git/objects/3f/78215d7e17da726fb352fd92b3c117db9b63ba +0 -0
  93. {pembot-0.0.5 → pembot-0.0.7}/pembot/.git/objects/3f/e072cf3cb6a9f30c3e9936e3ddf622e80270d0 +0 -0
  94. {pembot-0.0.5 → pembot-0.0.7}/pembot/.git/objects/41/cbeb6bcb4c6fa9ef9be571082d95ecb4ea0ee3 +0 -0
  95. {pembot-0.0.5 → pembot-0.0.7}/pembot/.git/objects/51/9e780574933d7627a083222bd10dd74f430904 +0 -0
  96. {pembot-0.0.5 → pembot-0.0.7}/pembot/.git/objects/61/46a371b9c1bd9f51af273f11f986cfd1bedeba +0 -0
  97. {pembot-0.0.5 → pembot-0.0.7}/pembot/.git/objects/63/1700a51c8fa97b543991f5f61bfcd1e7e1327d +0 -0
  98. {pembot-0.0.5 → pembot-0.0.7}/pembot/.git/objects/64/00040794955d17c9a1fe1aaaea59f2c4822177 +0 -0
  99. {pembot-0.0.5 → pembot-0.0.7}/pembot/.git/objects/6d/7a865a23b1cb4182f67907820104ced48b11c9 +0 -0
  100. {pembot-0.0.5 → pembot-0.0.7}/pembot/.git/objects/72/f047cda92abcd1ddc857f6461de605f8668331 +0 -0
  101. {pembot-0.0.5 → pembot-0.0.7}/pembot/.git/objects/73/2e98f08bc806c331b06847fc8c743f545499e5 +0 -0
  102. {pembot-0.0.5 → pembot-0.0.7}/pembot/.git/objects/86/cdaec229f1fbebf43042266b03878944669f25 +0 -0
  103. {pembot-0.0.5 → pembot-0.0.7}/pembot/.git/objects/87/d6df5217a4a374f8c1211a05f9bd657f72c9a7 +0 -0
  104. {pembot-0.0.5 → pembot-0.0.7}/pembot/.git/objects/8b/5be2af9b16f290549193859c214cd9072212e8 +0 -0
  105. {pembot-0.0.5 → pembot-0.0.7}/pembot/.git/objects/93/8f29d9b4b1ae86e39dddf9e3d115a82ddfc9b6 +0 -0
  106. {pembot-0.0.5 → pembot-0.0.7}/pembot/.git/objects/9b/123713e30fc9e225f9ac8ff5b02f8f8cf86456 +0 -0
  107. {pembot-0.0.5 → pembot-0.0.7}/pembot/.git/objects/ab/139d2cd4798dd8e2c565b80440b1a44b376126 +0 -0
  108. {pembot-0.0.5 → pembot-0.0.7}/pembot/.git/objects/ab/c6b15265171457b41e2cfdaf3b8c3994a59eb7 +0 -0
  109. {pembot-0.0.5 → pembot-0.0.7}/pembot/.git/objects/ac/9c9018c62fa30dc142665c1b5a375f4e056880 +0 -0
  110. {pembot-0.0.5 → pembot-0.0.7}/pembot/.git/objects/b1/1173d9b68db117437ccb9551461152e1e8a77d +0 -0
  111. {pembot-0.0.5 → pembot-0.0.7}/pembot/.git/objects/b2/4e79ab07fe9e68781961a25ff9f1dbb1546fbb +0 -0
  112. {pembot-0.0.5 → pembot-0.0.7}/pembot/.git/objects/b8/eea52176ffa4d88c5a9976bee26092421565d3 +0 -0
  113. {pembot-0.0.5 → pembot-0.0.7}/pembot/.git/objects/bf/068a0714e2145de83a5c004f4213b091439d0e +0 -0
  114. {pembot-0.0.5 → pembot-0.0.7}/pembot/.git/objects/bf/32a7e6872e5dc4025ee3df3c921ec7ade0855f +0 -0
  115. {pembot-0.0.5 → pembot-0.0.7}/pembot/.git/objects/c0/793458db6e1bee7f79f1a504fb8ff4963f8ed3 +0 -0
  116. {pembot-0.0.5 → pembot-0.0.7}/pembot/.git/objects/c2/443060c07101948487cfa93cc39e082e9e0f5f +0 -0
  117. {pembot-0.0.5 → pembot-0.0.7}/pembot/.git/objects/d0/937f7d832266337289d5ec09459f931a46fcf7 +0 -0
  118. {pembot-0.0.5 → pembot-0.0.7}/pembot/.git/objects/e5/3070f2b07f45d031444b09b1b38658f3caf29e +0 -0
  119. {pembot-0.0.5 → pembot-0.0.7}/pembot/.git/objects/e7/911a702079a6144997ea4e70f59abbe59ec2bc +0 -0
  120. {pembot-0.0.5 → pembot-0.0.7}/pembot/.git/objects/e9/1172752e9a421ae463112d2b0506b37498c98d +0 -0
  121. {pembot-0.0.5 → pembot-0.0.7}/pembot/.git/objects/ea/0af89e61a882c5afc2a8c281b2d96f174bfe58 +0 -0
  122. {pembot-0.0.5 → pembot-0.0.7}/pembot/.git/objects/eb/75e1c49f1e5b79dca17ccdbec8067756523238 +0 -0
  123. {pembot-0.0.5 → pembot-0.0.7}/pembot/.git/objects/f1/655afa1c5636c8d58969e3194bb770aefbc552 +0 -0
  124. {pembot-0.0.5 → pembot-0.0.7}/pembot/.git/objects/f4/e991088a63def67a30a2b8bbdb4d58514abab8 +0 -0
  125. {pembot-0.0.5 → pembot-0.0.7}/pembot/.git/objects/f8/cbb5bfd1503e66cec2c593362c60a317b6d300 +0 -0
  126. {pembot-0.0.5 → pembot-0.0.7}/pembot/.git/objects/f9/98e1f01c2bf0a20159fc851327af05beb3ac88 +0 -0
  127. {pembot-0.0.5 → pembot-0.0.7}/pembot/.git/objects/fa/9c9a62ec1203a5868b033ded428c2382c4e1b6 +0 -0
  128. {pembot-0.0.5 → pembot-0.0.7}/pembot/.git/objects/fb/6c90c9ce5e0cdfbe074a3f060afc66f62eefde +0 -0
  129. {pembot-0.0.5 → pembot-0.0.7}/pembot/.git/objects/fc/988aab7e2d46396dc595ad24345e8e77dda0e4 +0 -0
  130. {pembot-0.0.5 → pembot-0.0.7}/pembot/.git/objects/fc/e56f1e09d09a05b9babf796fb40bece176f3a2 +0 -0
  131. {pembot-0.0.5 → pembot-0.0.7}/pembot/.git/objects/pack/pack-d5469edc8c36e3bb1de5e0070e4d5b1eae935dd4.idx +0 -0
  132. {pembot-0.0.5 → pembot-0.0.7}/pembot/.git/objects/pack/pack-d5469edc8c36e3bb1de5e0070e4d5b1eae935dd4.pack +0 -0
  133. {pembot-0.0.5 → pembot-0.0.7}/pembot/.git/objects/pack/pack-d5469edc8c36e3bb1de5e0070e4d5b1eae935dd4.rev +0 -0
  134. {pembot-0.0.5 → pembot-0.0.7}/pembot/.git/packed-refs +0 -0
  135. {pembot-0.0.5 → pembot-0.0.7}/pembot/.git/refs/remotes/origin/HEAD +0 -0
  136. {pembot-0.0.5 → pembot-0.0.7}/pembot/.gitignore +0 -0
  137. {pembot-0.0.5 → pembot-0.0.7}/pembot/AnyToText/__init__.py +0 -0
  138. {pembot-0.0.5 → pembot-0.0.7}/pembot/LICENSE +0 -0
  139. {pembot-0.0.5 → pembot-0.0.7}/pembot/TextEmbedder/__init__.py +0 -0
  140. {pembot-0.0.5 → pembot-0.0.7}/pembot/TextEmbedder/gemini_embedder.py +0 -0
  141. {pembot-0.0.5 → pembot-0.0.7}/pembot/TextEmbedder/mongodb_embedder.py +0 -0
  142. {pembot-0.0.5 → pembot-0.0.7}/pembot/TextEmbedder/mongodb_index_creator.py +0 -0
  143. {pembot-0.0.5 → pembot-0.0.7}/pembot/TextEmbedder/vector_query.py +0 -0
  144. {pembot-0.0.5 → pembot-0.0.7}/pembot/gartner.py +0 -0
  145. {pembot-0.0.5 → pembot-0.0.7}/pembot/main.py +0 -0
  146. {pembot-0.0.5 → pembot-0.0.7}/pembot/output_structure_local.py +0 -0
  147. {pembot-0.0.5 → pembot-0.0.7}/pembot/pdf2markdown/.git/HEAD +0 -0
  148. {pembot-0.0.5 → pembot-0.0.7}/pembot/pdf2markdown/.git/description +0 -0
  149. {pembot-0.0.5 → pembot-0.0.7}/pembot/pdf2markdown/.git/hooks/applypatch-msg.sample +0 -0
  150. {pembot-0.0.5 → pembot-0.0.7}/pembot/pdf2markdown/.git/hooks/commit-msg.sample +0 -0
  151. {pembot-0.0.5 → pembot-0.0.7}/pembot/pdf2markdown/.git/hooks/fsmonitor-watchman.sample +0 -0
  152. {pembot-0.0.5 → pembot-0.0.7}/pembot/pdf2markdown/.git/hooks/post-update.sample +0 -0
  153. {pembot-0.0.5 → pembot-0.0.7}/pembot/pdf2markdown/.git/hooks/pre-applypatch.sample +0 -0
  154. {pembot-0.0.5 → pembot-0.0.7}/pembot/pdf2markdown/.git/hooks/pre-commit.sample +0 -0
  155. {pembot-0.0.5 → pembot-0.0.7}/pembot/pdf2markdown/.git/hooks/pre-merge-commit.sample +0 -0
  156. {pembot-0.0.5 → pembot-0.0.7}/pembot/pdf2markdown/.git/hooks/pre-push.sample +0 -0
  157. {pembot-0.0.5 → pembot-0.0.7}/pembot/pdf2markdown/.git/hooks/pre-rebase.sample +0 -0
  158. {pembot-0.0.5 → pembot-0.0.7}/pembot/pdf2markdown/.git/hooks/pre-receive.sample +0 -0
  159. {pembot-0.0.5 → pembot-0.0.7}/pembot/pdf2markdown/.git/hooks/prepare-commit-msg.sample +0 -0
  160. {pembot-0.0.5 → pembot-0.0.7}/pembot/pdf2markdown/.git/hooks/push-to-checkout.sample +0 -0
  161. {pembot-0.0.5 → pembot-0.0.7}/pembot/pdf2markdown/.git/hooks/sendemail-validate.sample +0 -0
  162. {pembot-0.0.5 → pembot-0.0.7}/pembot/pdf2markdown/.git/hooks/update.sample +0 -0
  163. {pembot-0.0.5 → pembot-0.0.7}/pembot/pdf2markdown/.git/info/exclude +0 -0
  164. {pembot-0.0.5/pembot/pdf2markdown/.git/logs → pembot-0.0.7/pembot/pdf2markdown/.git/logs/refs/remotes/origin}/HEAD +0 -0
  165. {pembot-0.0.5 → pembot-0.0.7}/pembot/pdf2markdown/.git/objects/pack/pack-d3051affdd6c31306dc53489168fc870872085d1.idx +0 -0
  166. {pembot-0.0.5 → pembot-0.0.7}/pembot/pdf2markdown/.git/objects/pack/pack-d3051affdd6c31306dc53489168fc870872085d1.pack +0 -0
  167. {pembot-0.0.5 → pembot-0.0.7}/pembot/pdf2markdown/.git/objects/pack/pack-d3051affdd6c31306dc53489168fc870872085d1.rev +0 -0
  168. {pembot-0.0.5 → pembot-0.0.7}/pembot/pdf2markdown/.git/packed-refs +0 -0
  169. {pembot-0.0.5 → pembot-0.0.7}/pembot/pdf2markdown/.git/refs/remotes/origin/HEAD +0 -0
  170. {pembot-0.0.5 → pembot-0.0.7}/pembot/pdf2markdown/LICENSE +0 -0
  171. {pembot-0.0.5 → pembot-0.0.7}/pembot/pdf2markdown/README.md +0 -0
  172. {pembot-0.0.5 → pembot-0.0.7}/pembot/pdf2markdown/__init__.py +0 -0
  173. {pembot-0.0.5 → pembot-0.0.7}/pembot/pdf2markdown/config/config.yaml +0 -0
  174. {pembot-0.0.5 → pembot-0.0.7}/pembot/pdf2markdown/requirements.txt +0 -0
  175. {pembot-0.0.5 → pembot-0.0.7}/pembot/pem.py +0 -0
  176. {pembot-0.0.5 → pembot-0.0.7}/pembot/query.py +0 -0
  177. {pembot-0.0.5 → pembot-0.0.7}/pembot/utils/__init__.py +0 -0
  178. {pembot-0.0.5 → pembot-0.0.7}/pembot/utils/inference_client.py +0 -0
  179. {pembot-0.0.5 → pembot-0.0.7}/pembot/utils/string_tools.py +0 -0
  180. {pembot-0.0.5 → pembot-0.0.7}/pyproject.toml +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: pembot
3
- Version: 0.0.5
3
+ Version: 0.0.7
4
4
  Summary: A Python Package to convert PEM blog content to usseful information by leveraging LLMs
5
5
  Author-email: cyto <aryan_sidhwani@protonmail.com>
6
6
  License-Expression: MIT
@@ -0,0 +1 @@
1
+ added model name to convertor
Binary file
@@ -5,3 +5,6 @@ ac9c9018c62fa30dc142665c1b5a375f4e056880 72f047cda92abcd1ddc857f6461de605f866833
5
5
  e91172752e9a421ae463112d2b0506b37498c98d 0c8d9b2690545bf1906b05cd9f18b783b3eb74f1 cyto <silverstone965@gmail.com> 1749716350 +0530 commit: added a pem blog chunking module for updating from local, and, an embedding loop to embed all the blogs, with document id as the filter in the search, and the first line title as the filter in updation
6
6
  0c8d9b2690545bf1906b05cd9f18b783b3eb74f1 eb75e1c49f1e5b79dca17ccdbec8067756523238 cyto <silverstone965@gmail.com> 1750856653 +0530 commit: made arrangements for the cases when custom file bytes are to be processed to text output; handled a ollama running / crashing error
7
7
  eb75e1c49f1e5b79dca17ccdbec8067756523238 0bdb4169fc0f312b8698f1df17a258fff163aeaa cyto <silverstone965@gmail.com> 1750937276 +0530 commit: fixed the output_dir bug; fixed the excel to json function; ran some tests on convertor; incremented the version on the package; removed dependency on schema / structure, and shifted required fields to a pickle file path in the cli args;
8
+ 0bdb4169fc0f312b8698f1df17a258fff163aeaa 9528bbccd167e3f4ad583a1ae9fac98a52620e27 cyto <silverstone965@gmail.com> 1750947488 +0530 commit: handled local llm nonexistent error properly for choice of just passing None as llm_client;
9
+ 9528bbccd167e3f4ad583a1ae9fac98a52620e27 ef0503a60244391590b16042019032e91d7cc30d cyto <silverstone965@gmail.com> 1751872559 +0530 commit: added a model_name_parameter to change models quicky
10
+ ef0503a60244391590b16042019032e91d7cc30d af80ddb5890f062e364ea8ade2d602df4e12de8c cyto <silverstone965@gmail.com> 1751896700 +0530 commit: added model name to convertor
@@ -5,3 +5,6 @@ ac9c9018c62fa30dc142665c1b5a375f4e056880 72f047cda92abcd1ddc857f6461de605f866833
5
5
  e91172752e9a421ae463112d2b0506b37498c98d 0c8d9b2690545bf1906b05cd9f18b783b3eb74f1 cyto <silverstone965@gmail.com> 1749716350 +0530 commit: added a pem blog chunking module for updating from local, and, an embedding loop to embed all the blogs, with document id as the filter in the search, and the first line title as the filter in updation
6
6
  0c8d9b2690545bf1906b05cd9f18b783b3eb74f1 eb75e1c49f1e5b79dca17ccdbec8067756523238 cyto <silverstone965@gmail.com> 1750856653 +0530 commit: made arrangements for the cases when custom file bytes are to be processed to text output; handled a ollama running / crashing error
7
7
  eb75e1c49f1e5b79dca17ccdbec8067756523238 0bdb4169fc0f312b8698f1df17a258fff163aeaa cyto <silverstone965@gmail.com> 1750937276 +0530 commit: fixed the output_dir bug; fixed the excel to json function; ran some tests on convertor; incremented the version on the package; removed dependency on schema / structure, and shifted required fields to a pickle file path in the cli args;
8
+ 0bdb4169fc0f312b8698f1df17a258fff163aeaa 9528bbccd167e3f4ad583a1ae9fac98a52620e27 cyto <silverstone965@gmail.com> 1750947488 +0530 commit: handled local llm nonexistent error properly for choice of just passing None as llm_client;
9
+ 9528bbccd167e3f4ad583a1ae9fac98a52620e27 ef0503a60244391590b16042019032e91d7cc30d cyto <silverstone965@gmail.com> 1751872559 +0530 commit: added a model_name_parameter to change models quicky
10
+ ef0503a60244391590b16042019032e91d7cc30d af80ddb5890f062e364ea8ade2d602df4e12de8c cyto <silverstone965@gmail.com> 1751896700 +0530 commit: added model name to convertor
@@ -4,3 +4,6 @@ ac9c9018c62fa30dc142665c1b5a375f4e056880 72f047cda92abcd1ddc857f6461de605f866833
4
4
  e91172752e9a421ae463112d2b0506b37498c98d 0c8d9b2690545bf1906b05cd9f18b783b3eb74f1 cyto <silverstone965@gmail.com> 1749716371 +0530 update by push
5
5
  0c8d9b2690545bf1906b05cd9f18b783b3eb74f1 eb75e1c49f1e5b79dca17ccdbec8067756523238 cyto <silverstone965@gmail.com> 1750856672 +0530 update by push
6
6
  eb75e1c49f1e5b79dca17ccdbec8067756523238 0bdb4169fc0f312b8698f1df17a258fff163aeaa cyto <silverstone965@gmail.com> 1750937389 +0530 update by push
7
+ 0bdb4169fc0f312b8698f1df17a258fff163aeaa 9528bbccd167e3f4ad583a1ae9fac98a52620e27 cyto <silverstone965@gmail.com> 1750947502 +0530 update by push
8
+ 9528bbccd167e3f4ad583a1ae9fac98a52620e27 ef0503a60244391590b16042019032e91d7cc30d cyto <silverstone965@gmail.com> 1751872581 +0530 update by push
9
+ ef0503a60244391590b16042019032e91d7cc30d af80ddb5890f062e364ea8ade2d602df4e12de8c cyto <silverstone965@gmail.com> 1751896713 +0530 update by push
@@ -0,0 +1,3 @@
1
+ x��Kj1D��)z� ��ĴZ-{�h����>2�AVU��Qom��9��j�5�d ��,��#��tQF&T�J|��ۀ�t̙�(�T��E�
2
+ 9U��i�%� �>�}z�Dz��~��q����pYߩ�OP���s ޤ3R�v
3
+ �*�.��z���a��Ԟ�t��¯����B���k�U�
@@ -0,0 +1 @@
1
+ af80ddb5890f062e364ea8ade2d602df4e12de8c
@@ -0,0 +1 @@
1
+ af80ddb5890f062e364ea8ade2d602df4e12de8c
@@ -31,10 +31,14 @@ EXCEL_FILE_TYPES= [
31
31
  class Convertor():
32
32
 
33
33
 
34
- def __init__(self, myfile: Path | None= None, output_dir: Path | None= None, file_bytes: bytes | None= None, suffix: str | None= None, file_type: str | None= None):
34
+ def __init__(self, myfile: Path | None= None, output_dir: Path | None= None, file_bytes: bytes | None= None, suffix: str | None= None, file_type: str | None= None, model_name: str | None = None):
35
35
 
36
36
  self.output= ""
37
37
 
38
+ if model_name is None:
39
+ # model_name= "gemini-2.5-flash"
40
+ model_name= "Nanonets-OCR-s"
41
+
38
42
  # file_type can be pdf, excel, etc.
39
43
  if output_dir is None and myfile is None and file_bytes is not None and suffix is not None:
40
44
  with tempfile.TemporaryDirectory() as dp:
@@ -43,7 +47,7 @@ class Convertor():
43
47
  myfile= Path(fp.name)
44
48
  output_dir= Path(dp)
45
49
  if file_type == 'pdf':
46
- extractor= MarkdownPDFExtractor(str(myfile), output_path= str(output_dir), page_delimiter= "-- NEXT PAGE --")
50
+ extractor= MarkdownPDFExtractor(str(myfile), output_path= str(output_dir), page_delimiter= "-- NEXT PAGE --", model_name= model_name)
47
51
  extractor.extract()
48
52
  with open(output_dir / (myfile.stem + '.md')) as output_file:
49
53
  self.output= output_file.read()
@@ -67,7 +71,7 @@ class Convertor():
67
71
  print("the file was json")
68
72
  elif mt == 'application/pdf':
69
73
  print("the file was pdf, outputting in: ", output_dir)
70
- extractor= MarkdownPDFExtractor(str(myfile), output_path= str(self.output_dir), page_delimiter= "-- NEXT PAGE --")
74
+ extractor= MarkdownPDFExtractor(str(myfile), output_path= str(self.output_dir), page_delimiter= "-- NEXT PAGE --", model_name= model_name)
71
75
  extractor.extract()
72
76
 
73
77
  elif mt in EXCEL_FILE_TYPES:
@@ -333,10 +337,10 @@ def chunk_text(text, chunk_size=500, overlap_size=50):
333
337
  if __name__ == '__main__':
334
338
  print("Test Run Start:")
335
339
  try:
336
- # print("Test 1: scaned pdf page, bytes")
337
- # with open("/home/cyto/Documents/scanned.pdf", "rb") as imgpdf:
338
- # conv= Convertor(file_bytes= imgpdf.read(), suffix= ".pdf", file_type= "pdf")
339
- # print(conv.output)
340
+ print("Test 1: scaned pdf page, bytes")
341
+ with open("/home/cyto/Documents/scanned.pdf", "rb") as imgpdf:
342
+ conv= Convertor(file_bytes= imgpdf.read(), suffix= ".pdf", file_type= "pdf")
343
+ print(conv.output)
340
344
 
341
345
  # print("Test 2: JD pdf, bytes")
342
346
  # with open("/home/cyto/dev/pembotdir/jds/PM Trainee.pdf", "rb") as imgpdf:
@@ -1,6 +1,6 @@
1
1
  """
2
2
  A Python Package to convert PEM blog content to usseful information by leveraging LLMs
3
3
  """
4
- __version__ = '0.0.5'
4
+ __version__ = '0.0.7'
5
5
  from .main import save_to_json_file, make_query
6
6
  __all__ = ["save_to_json_file", "make_query"]
@@ -2,4 +2,4 @@ OUTPUT_DIR: /home/cyto/dev/pembotdir
2
2
  PAGE_DELIMITER: ___________________________ NEXT PAGE ___________________________
3
3
  app:
4
4
  name: pembot
5
- version: 0.0.5
5
+ version: 0.0.7
@@ -0,0 +1 @@
1
+ handled the gpu errors non-gracefully so that it stops
@@ -9,3 +9,6 @@
9
9
  [branch "main"]
10
10
  remote = origin
11
11
  merge = refs/heads/main
12
+ [remote "myorigin"]
13
+ url = https://github.com/silverstone-git/pdf-to-markdown.git
14
+ fetch = +refs/heads/*:refs/remotes/myorigin/*
@@ -0,0 +1,5 @@
1
+ 0000000000000000000000000000000000000000 ffb759ee4605b232366a9ee58134532913c3f9e0 cyto <cyto@callisto.localdomain> 1747745478 +0530 clone: from https://github.com/iamarunbrahma/pdf-to-markdown
2
+ ffb759ee4605b232366a9ee58134532913c3f9e0 b8702320e56074e9680181d8b7897d6a0a552e2d cyto <silverstone965@gmail.com> 1750947962 +0530 commit: handled config loading errors gracefully; added gemini support, as an option; added huggingface nanonets transformers support (as an option); redesigned the extract markdown for captioning and image ocr (block image and full-page image);
3
+ b8702320e56074e9680181d8b7897d6a0a552e2d 14251b198e0bac39a3dc3b42f9e57b20c01465fb cyto <silverstone965@gmail.com> 1751604763 +0530 commit: removed deps on torch and transformers; used gradio client for ocr through public spaces;
4
+ 14251b198e0bac39a3dc3b42f9e57b20c01465fb b48d697aa9fd97151eb2a84a1af5d408b7630232 cyto <silverstone965@gmail.com> 1751871887 +0530 commit: cyto/argument-list-bug-fix;authentication-used-in-gradio-client
5
+ b48d697aa9fd97151eb2a84a1af5d408b7630232 f3b2d76c75bbd50e04fc4c2ad17fc94ca6daed32 cyto <silverstone965@gmail.com> 1751896628 +0530 commit: handled the gpu errors non-gracefully so that it stops
@@ -0,0 +1,5 @@
1
+ 0000000000000000000000000000000000000000 ffb759ee4605b232366a9ee58134532913c3f9e0 cyto <cyto@callisto.localdomain> 1747745478 +0530 clone: from https://github.com/iamarunbrahma/pdf-to-markdown
2
+ ffb759ee4605b232366a9ee58134532913c3f9e0 b8702320e56074e9680181d8b7897d6a0a552e2d cyto <silverstone965@gmail.com> 1750947962 +0530 commit: handled config loading errors gracefully; added gemini support, as an option; added huggingface nanonets transformers support (as an option); redesigned the extract markdown for captioning and image ocr (block image and full-page image);
3
+ b8702320e56074e9680181d8b7897d6a0a552e2d 14251b198e0bac39a3dc3b42f9e57b20c01465fb cyto <silverstone965@gmail.com> 1751604763 +0530 commit: removed deps on torch and transformers; used gradio client for ocr through public spaces;
4
+ 14251b198e0bac39a3dc3b42f9e57b20c01465fb b48d697aa9fd97151eb2a84a1af5d408b7630232 cyto <silverstone965@gmail.com> 1751871887 +0530 commit: cyto/argument-list-bug-fix;authentication-used-in-gradio-client
5
+ b48d697aa9fd97151eb2a84a1af5d408b7630232 f3b2d76c75bbd50e04fc4c2ad17fc94ca6daed32 cyto <silverstone965@gmail.com> 1751896628 +0530 commit: handled the gpu errors non-gracefully so that it stops
@@ -0,0 +1,4 @@
1
+ 0000000000000000000000000000000000000000 b8702320e56074e9680181d8b7897d6a0a552e2d cyto <silverstone965@gmail.com> 1750948073 +0530 update by push
2
+ b8702320e56074e9680181d8b7897d6a0a552e2d 14251b198e0bac39a3dc3b42f9e57b20c01465fb cyto <silverstone965@gmail.com> 1751604904 +0530 update by push
3
+ 14251b198e0bac39a3dc3b42f9e57b20c01465fb b48d697aa9fd97151eb2a84a1af5d408b7630232 cyto <silverstone965@gmail.com> 1751872077 +0530 update by push
4
+ b48d697aa9fd97151eb2a84a1af5d408b7630232 f3b2d76c75bbd50e04fc4c2ad17fc94ca6daed32 cyto <silverstone965@gmail.com> 1751896663 +0530 update by push
@@ -0,0 +1 @@
1
+ x��Kj!3vw� �l媷?`k�v��>�!�C'��:'Hk�f't:�lȺ�6g�u 2j�߈G�TV��ةN��gb�rp���F��ɚ���RI��<Z���
@@ -0,0 +1 @@
1
+ f3b2d76c75bbd50e04fc4c2ad17fc94ca6daed32
@@ -0,0 +1 @@
1
+ f3b2d76c75bbd50e04fc4c2ad17fc94ca6daed32
@@ -2,11 +2,9 @@ import fitz
2
2
  import pdfplumber
3
3
  import re
4
4
  import yaml
5
- # import pytesseract
5
+ import pytesseract
6
6
  import numpy as np
7
- from transformers import AutoTokenizer, AutoProcessor, AutoModelForImageTextToText, VisionEncoderDecoderModel, ViTImageProcessor
8
7
  from typing import Literal, final
9
- import torch
10
8
  from PIL import Image
11
9
  import os
12
10
  import logging
@@ -19,6 +17,9 @@ import io
19
17
  from google import genai
20
18
  from google.genai import types
21
19
  import mimetypes
20
+ from gradio_client import Client, handle_file
21
+ import gradio as gr
22
+ import tempfile
22
23
 
23
24
 
24
25
 
@@ -75,25 +76,18 @@ class MarkdownPDFExtractor(PDFExtractor):
75
76
  super().__init__(pdf_path)
76
77
 
77
78
  if model_name is None:
78
- self.MODEL_NAME= "gemini-2.5-flash"
79
+ # self.MODEL_NAME= "gemini-2.5-flash"
80
+ self.MODEL_NAME= "Nanonets-OCR-s"
79
81
  else:
80
82
  self.MODEL_NAME= model_name
81
83
 
82
84
  if "gemini" in self.MODEL_NAME:
83
85
  self.gclient = genai.Client(api_key= os.getenv("GEMINI_API_KEY", ''))
84
- else:
85
- model_path = "nanonets/Nanonets-OCR-s"
86
- self.model = AutoModelForImageTextToText.from_pretrained(
87
- model_path,
88
- torch_dtype="auto",
89
- device_map="auto",
90
- attn_implementation="flash_attention_2"
91
- )
92
- self.model.eval()
93
- self.tokenizer = AutoTokenizer.from_pretrained(model_path)
94
- self.processor = AutoProcessor.from_pretrained(model_path)
95
- self.setup_image_captioning()
86
+ elif "anonet" in self.MODEL_NAME:
87
+ # self.nclient= Client("prithivMLmods/Multimodal-OCR2")
96
88
 
89
+ # zerogpu public
90
+ self.nclient= Client("deepak-mehta/ocr-simplify", hf_token= os.getenv('HF_TOKEN', ''))
97
91
 
98
92
 
99
93
  self.markdown_content= ""
@@ -108,25 +102,6 @@ class MarkdownPDFExtractor(PDFExtractor):
108
102
 
109
103
 
110
104
 
111
- def setup_image_captioning(self):
112
- """Set up the image captioning model."""
113
- try:
114
- self.model = VisionEncoderDecoderModel.from_pretrained(
115
- "nlpconnect/vit-gpt2-image-captioning"
116
- )
117
- self.feature_extractor = ViTImageProcessor.from_pretrained(
118
- "nlpconnect/vit-gpt2-image-captioning"
119
- )
120
- self.tokenizer = AutoTokenizer.from_pretrained(
121
- "nlpconnect/vit-gpt2-image-captioning"
122
- )
123
- self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
124
- self.model.to(self.device)
125
- self.logger.info("Image captioning model set up successfully.")
126
- except Exception as e:
127
- self.logger.error(f"Error setting up image captioning model: {e}")
128
- self.logger.exception(traceback.format_exc())
129
-
130
105
  def extract(self):
131
106
  try:
132
107
  markdown_content, markdown_pages = self.extract_markdown()
@@ -140,15 +115,25 @@ class MarkdownPDFExtractor(PDFExtractor):
140
115
  except Exception as e:
141
116
  self.logger.error(f"Error processing PDF: {e}")
142
117
  self.logger.exception(traceback.format_exc())
118
+
119
+ error_message= str(e).lower()
120
+ if "GPU" in error_message and "quota" in error_message:
121
+ return "GPU quota error", []
143
122
  return "", []
144
123
 
145
124
 
146
- def ocr_page_with_nanonets_s(self, pil_image, img_bytes, max_new_tokens: int | None = None):
147
- prompt = """Extract the text from the above document as if you were reading it naturally. Return the tables in html format. Return the equations in LaTeX representation. If there is an image in the document and image caption is not present, add a small description of the image inside the <img></img> tag; otherwise, add the image caption inside <img></img>. Watermarks should be wrapped in brackets. Ex: <watermark>OFFICIAL COPY</watermark>. Page numbers should be wrapped in brackets. Ex: <page_number>14</page_number> or <page_number>9/22</page_number>. Prefer using ☐ and ☑ for check boxes."""
125
+ def image_ocr(self, pil_image, img_bytes, max_new_tokens: int | None = None, prompt: str | None= None):
126
+ if prompt is None:
127
+ prompt = """Extract the text from the above document as if you were reading it naturally. Return the tables in html format. Return the equations in LaTeX representation. If there is an image in the document and image caption is not present, add a small description of the image inside the <img></img> tag; otherwise, add the image caption inside <img></img>. Watermarks should be wrapped in brackets. Ex: <watermark>OFFICIAL COPY</watermark>. Page numbers should be wrapped in brackets. Ex: <page_number>14</page_number> or <page_number>9/22</page_number>. Prefer using ☐ and ☑ for check boxes."""
148
128
  if max_new_tokens is None:
149
129
  max_new_tokens= 4096
150
130
 
151
- if 'gemini' in self.MODEL_NAME:
131
+ w, h= pil_image.size
132
+ if w < 200 or h < 50:
133
+ return "<img> A small image </img>"
134
+
135
+ model_name= self.MODEL_NAME.lower()
136
+ if 'gemini' in model_name:
152
137
 
153
138
  image_format = pil_image.format
154
139
  dummy_filename = f"dummy.{image_format.lower()}"
@@ -165,24 +150,46 @@ class MarkdownPDFExtractor(PDFExtractor):
165
150
  )
166
151
  # print("response :", response)
167
152
  return response.text
168
- else:
169
- image = pil_image
170
- messages = [
171
- {"role": "system", "content": "You are a helpful assistant."},
172
- {"role": "user", "content": [
173
- {"type": "image", "image": image},
174
- {"type": "text", "text": prompt},
175
- ]},
176
- ]
177
- text = self.processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
178
- inputs = self.processor(text=[text], images=[image], padding=True, return_tensors="pt")
179
- inputs = inputs.to(self.model.device)
180
-
181
- output_ids = self.model.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False)
182
- generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(inputs.input_ids, output_ids)]
153
+ elif 'nanonet' in model_name:
183
154
 
184
- output_text = self.processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
185
- return output_text[0]
155
+ result= ""
156
+ try:
157
+ with tempfile.NamedTemporaryFile(suffix=f'.{pil_image.format.lower()}', mode= 'w') as temp_file:
158
+ pil_image.save(temp_file.name)
159
+ print("file name: ", temp_file.name)
160
+ gr_image= handle_file(temp_file.name)
161
+ print("gr image : ", gr_image)
162
+ result = self.nclient.predict(
163
+ # model_name="Nanonets-OCR-s",
164
+ # text= prompt,
165
+ gr_image,
166
+ # max_new_tokens=max_new_tokens,
167
+ # temperature=0.6,
168
+ # top_p=0.9,
169
+ # top_k=50,
170
+ # repetition_penalty=1.2,
171
+
172
+ # prithiv model
173
+ # api_name="/generate_image"
174
+
175
+ max_new_tokens,
176
+
177
+ # spaces zerogpu
178
+ api_name="/predict"
179
+ )
180
+ print("ocr'd: ", result[:100] + "...")
181
+ except Exception as e:
182
+ print("Error during nanonet inference", e)
183
+ error_message = str(e)
184
+ if "You have exceeded your Pro GPU quota" in error_message:
185
+ # print("\n\n\nFALLING BACK TO TESS\n\n\n")
186
+ # return pytesseract.image_to_string(pil_image)
187
+ raise e
188
+
189
+
190
+ return result
191
+ else:
192
+ return pytesseract.image_to_string(pil_image)
186
193
 
187
194
 
188
195
 
@@ -219,7 +226,7 @@ class MarkdownPDFExtractor(PDFExtractor):
219
226
  for page_num, page in enumerate(doc):
220
227
  current_page_markdown_blocks = [] # Collect markdown blocks for the current page
221
228
  page_has_searchable_text = False
222
- page_has_embedded_images = False
229
+ # page_has_embedded_images = False
223
230
 
224
231
  self.logger.info(f"\nProcessing page {page_num + 1}...")
225
232
 
@@ -252,7 +259,7 @@ class MarkdownPDFExtractor(PDFExtractor):
252
259
  try:
253
260
  image_bytes= io.BytesIO(img_data)
254
261
  pil_image = Image.open(image_bytes)
255
- ocr_text_from_block_image = self.ocr_page_with_nanonets_s(
262
+ ocr_text_from_block_image = self.image_ocr(
256
263
  pil_image, image_bytes, max_new_tokens=15000
257
264
  )
258
265
 
@@ -265,6 +272,9 @@ class MarkdownPDFExtractor(PDFExtractor):
265
272
  except Exception as e:
266
273
  self.logger.error(f" Error processing embedded image block for OCR: {e}")
267
274
  current_page_markdown_blocks.append("\n\n![Image Processing Error](error_on_page_{page_num+1}_block_{block_num+1}.png)\n\n")
275
+ error_message= str(e).lower()
276
+ if "GPU" in error_message and "quota" in error_message:
277
+ raise e
268
278
 
269
279
 
270
280
  # Insert tables at their approximate positions (after blocks are processed for the page)
@@ -292,7 +302,7 @@ class MarkdownPDFExtractor(PDFExtractor):
292
302
  image_bytestream= io.BytesIO(img_bytes)
293
303
  pil_image = Image.open(image_bytestream)
294
304
 
295
- ocr_text_from_page = self.ocr_page_with_nanonets_s(
305
+ ocr_text_from_page = self.image_ocr(
296
306
  pil_image, image_bytestream, max_new_tokens=15000
297
307
  )
298
308
 
@@ -309,6 +319,9 @@ class MarkdownPDFExtractor(PDFExtractor):
309
319
  self.logger.info(f" Full-page OCR yielded no text for page {page_num+1}.")
310
320
  except Exception as e:
311
321
  self.logger.error(f" Error during full-page OCR on page {page_num+1}: {e}")
322
+ error_message= str(e).lower()
323
+ if "GPU" in error_message and "quota" in error_message:
324
+ raise e
312
325
  else:
313
326
  self.logger.info(f" Page {page_num + 1} has sufficient searchable text or embedded image OCR; skipping full-page OCR.")
314
327
 
@@ -332,7 +345,12 @@ class MarkdownPDFExtractor(PDFExtractor):
332
345
  except Exception as e:
333
346
  self.logger.critical(f"An unexpected error occurred during markdown extraction: {e}")
334
347
  self.logger.exception(traceback.format_exc())
335
- return "", []
348
+
349
+ error_message= str(e).lower()
350
+ if "GPU" in error_message and "quota" in error_message:
351
+ return "GPU quota error", []
352
+ else:
353
+ return "", []
336
354
 
337
355
  def extract_tables(self):
338
356
  """Extract tables from PDF using pdfplumber."""
@@ -389,7 +407,7 @@ class MarkdownPDFExtractor(PDFExtractor):
389
407
  # ocr_result = pytesseract.image_to_string(
390
408
  # image
391
409
  # )
392
- ocr_result= self.ocr_page_with_nanonets_s(image, image_bytes, max_new_tokens=15000)
410
+ ocr_result= self.image_ocr(image, image_bytes, max_new_tokens=15000)
393
411
 
394
412
 
395
413
  return ocr_result.strip()
@@ -409,41 +427,15 @@ class MarkdownPDFExtractor(PDFExtractor):
409
427
  if image.mode != "RGB":
410
428
  image = image.convert("RGB")
411
429
 
412
- image_format = image.format
413
- dummy_filename = f"dummy.{image_format.lower()}"
414
- mime_type, _ = mimetypes.guess_type(dummy_filename)
415
-
416
- if "gemini" in self.MODEL_NAME:
417
- response= self.gclient.models.generate_content(
418
- model= self.MODEL_NAME,
419
- contents=[
420
- types.Part.from_bytes(
421
- data=image_bytes.getvalue(),
422
- mime_type= mime_type
423
- ),
424
- "Write a caption for this image"
425
- ]
426
- )
427
- return response.text
428
- else:
429
- # Ensure the image is in the correct shape
430
- image = np.array(image).transpose(2, 0, 1) # Convert to (C, H, W) format
430
+ caption= self.image_ocr(image, image_bytes, max_new_tokens=15000, prompt= "Write a caption for this image")
431
+ return caption
431
432
 
432
- inputs = self.feature_extractor(images=image, return_tensors="pt").to(
433
- self.device
434
- )
435
- pixel_values = inputs.pixel_values
436
-
437
- generated_ids = self.model.generate(pixel_values, max_length=30)
438
-
439
- generated_ids = self.model.generate(pixel_values, max_length=30)
440
- generated_caption = self.tokenizer.batch_decode(
441
- generated_ids, skip_special_tokens=True
442
- )[0]
443
- return generated_caption.strip()
444
433
  except Exception as e:
445
434
  self.logger.error(f"Error captioning image: {e}")
446
435
  self.logger.exception(traceback.format_exc())
436
+ error_message= str(e)
437
+ if "GPU" in error_message and "quota" in error_message:
438
+ raise e
447
439
  return ""
448
440
 
449
441
  def clean_text(self, text):
@@ -758,6 +750,7 @@ class MarkdownPDFExtractor(PDFExtractor):
758
750
  self.logger.exception(traceback.format_exc())
759
751
  return ""
760
752
 
753
+
761
754
  def get_header_level(self, font_size):
762
755
  """Determine header level based on font size."""
763
756
  if font_size > 24:
@@ -0,0 +1,4 @@
1
+ {
2
+ "venvPath": "../..",
3
+ "venv": "venvpem"
4
+ }
@@ -0,0 +1,80 @@
1
+ aiofiles==24.1.0
2
+ annotated-types==0.7.0
3
+ anyio==4.9.0
4
+ audioop-lts==0.2.1
5
+ cachetools==5.5.2
6
+ certifi==2025.6.15
7
+ cffi==1.17.1
8
+ charset-normalizer==3.4.2
9
+ click==8.2.1
10
+ cryptography==45.0.5
11
+ dnspython==2.7.0
12
+ et_xmlfile==2.0.0
13
+ fastapi==0.115.14
14
+ ffmpy==0.6.0
15
+ filelock==3.18.0
16
+ fsspec==2025.5.1
17
+ google-auth==2.40.3
18
+ google-genai==1.24.0
19
+ gradio==5.35.0
20
+ gradio_client==1.10.4
21
+ greenlet==3.2.3
22
+ groovy==0.1.2
23
+ h11==0.16.0
24
+ hf-xet==1.1.5
25
+ httpcore==1.0.9
26
+ httpx==0.28.1
27
+ huggingface-hub==0.33.2
28
+ idna==3.10
29
+ Jinja2==3.1.6
30
+ markdown-it-py==3.0.0
31
+ MarkupSafe==3.0.2
32
+ mdurl==0.1.2
33
+ msgpack==1.1.1
34
+ numpy==2.3.1
35
+ ollama==0.5.1
36
+ openpyxl==3.1.5
37
+ orjson==3.10.18
38
+ packaging==25.0
39
+ pandas==2.3.0
40
+ pathlib==1.0.1
41
+ pdfminer.six==20250506
42
+ pdfplumber==0.11.7
43
+ pembot==0.0.6
44
+ pillow==11.3.0
45
+ pyasn1==0.6.1
46
+ pyasn1_modules==0.4.2
47
+ pycparser==2.22
48
+ pydantic==2.11.7
49
+ pydantic_core==2.33.2
50
+ pydub==0.25.1
51
+ Pygments==2.19.2
52
+ pymongo==4.13.2
53
+ PyMuPDF==1.26.3
54
+ pynvim==0.5.2
55
+ pypdfium2==4.30.1
56
+ pytesseract==0.3.13
57
+ python-dateutil==2.9.0.post0
58
+ python-multipart==0.0.20
59
+ pytz==2025.2
60
+ PyYAML==6.0.2
61
+ requests==2.32.4
62
+ rich==14.0.0
63
+ rsa==4.9.1
64
+ ruff==0.12.1
65
+ safehttpx==0.1.6
66
+ semantic-version==2.10.0
67
+ shellingham==1.5.4
68
+ six==1.17.0
69
+ sniffio==1.3.1
70
+ starlette==0.46.2
71
+ tenacity==8.5.0
72
+ tomlkit==0.13.3
73
+ tqdm==4.67.1
74
+ typer==0.16.0
75
+ typing-inspection==0.4.1
76
+ typing_extensions==4.14.0
77
+ tzdata==2025.2
78
+ urllib3==2.5.0
79
+ uvicorn==0.35.0
80
+ websockets==15.0.1
@@ -1 +0,0 @@
1
- fixed the output_dir bug; fixed the excel to json function; ran some tests on convertor; incremented the version on the package; removed dependency on schema / structure, and shifted required fields to a pickle file path in the cli args;
Binary file
@@ -1 +0,0 @@
1
- 0bdb4169fc0f312b8698f1df17a258fff163aeaa
@@ -1 +0,0 @@
1
- 0bdb4169fc0f312b8698f1df17a258fff163aeaa
Binary file