bbot 2.0.1.4654rc0__py3-none-any.whl → 2.3.0.5397rc0__py3-none-any.whl

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.

Potentially problematic release.


This version of bbot might be problematic. Click here for more details.

Files changed (270)
  1. bbot/__init__.py +1 -1
  2. bbot/cli.py +3 -7
  3. bbot/core/config/files.py +0 -1
  4. bbot/core/config/logger.py +34 -4
  5. bbot/core/core.py +21 -6
  6. bbot/core/engine.py +9 -8
  7. bbot/core/event/base.py +162 -63
  8. bbot/core/helpers/bloom.py +10 -3
  9. bbot/core/helpers/command.py +9 -8
  10. bbot/core/helpers/depsinstaller/installer.py +89 -32
  11. bbot/core/helpers/depsinstaller/sudo_askpass.py +38 -2
  12. bbot/core/helpers/diff.py +10 -10
  13. bbot/core/helpers/dns/brute.py +18 -14
  14. bbot/core/helpers/dns/dns.py +16 -15
  15. bbot/core/helpers/dns/engine.py +159 -132
  16. bbot/core/helpers/dns/helpers.py +2 -2
  17. bbot/core/helpers/dns/mock.py +26 -8
  18. bbot/core/helpers/files.py +1 -1
  19. bbot/core/helpers/helper.py +7 -4
  20. bbot/core/helpers/interactsh.py +3 -3
  21. bbot/core/helpers/libmagic.py +65 -0
  22. bbot/core/helpers/misc.py +65 -22
  23. bbot/core/helpers/names_generator.py +17 -3
  24. bbot/core/helpers/process.py +0 -20
  25. bbot/core/helpers/regex.py +1 -1
  26. bbot/core/helpers/regexes.py +12 -6
  27. bbot/core/helpers/validators.py +1 -2
  28. bbot/core/helpers/web/client.py +1 -1
  29. bbot/core/helpers/web/engine.py +18 -13
  30. bbot/core/helpers/web/web.py +25 -116
  31. bbot/core/helpers/wordcloud.py +5 -5
  32. bbot/core/modules.py +36 -27
  33. bbot/core/multiprocess.py +58 -0
  34. bbot/core/shared_deps.py +46 -3
  35. bbot/db/sql/models.py +147 -0
  36. bbot/defaults.yml +15 -10
  37. bbot/errors.py +0 -8
  38. bbot/modules/anubisdb.py +2 -2
  39. bbot/modules/apkpure.py +63 -0
  40. bbot/modules/azure_tenant.py +2 -2
  41. bbot/modules/baddns.py +35 -19
  42. bbot/modules/baddns_direct.py +92 -0
  43. bbot/modules/baddns_zone.py +3 -8
  44. bbot/modules/badsecrets.py +4 -3
  45. bbot/modules/base.py +195 -51
  46. bbot/modules/bevigil.py +7 -7
  47. bbot/modules/binaryedge.py +7 -4
  48. bbot/modules/bufferoverrun.py +47 -0
  49. bbot/modules/builtwith.py +6 -10
  50. bbot/modules/bypass403.py +5 -5
  51. bbot/modules/c99.py +10 -7
  52. bbot/modules/censys.py +9 -13
  53. bbot/modules/certspotter.py +5 -3
  54. bbot/modules/chaos.py +9 -7
  55. bbot/modules/code_repository.py +1 -0
  56. bbot/modules/columbus.py +3 -3
  57. bbot/modules/crt.py +5 -3
  58. bbot/modules/deadly/dastardly.py +1 -1
  59. bbot/modules/deadly/ffuf.py +9 -9
  60. bbot/modules/deadly/nuclei.py +3 -3
  61. bbot/modules/deadly/vhost.py +4 -3
  62. bbot/modules/dehashed.py +1 -1
  63. bbot/modules/digitorus.py +1 -1
  64. bbot/modules/dnsbimi.py +145 -0
  65. bbot/modules/dnscaa.py +3 -3
  66. bbot/modules/dnsdumpster.py +4 -4
  67. bbot/modules/dnstlsrpt.py +144 -0
  68. bbot/modules/docker_pull.py +7 -5
  69. bbot/modules/dockerhub.py +2 -2
  70. bbot/modules/dotnetnuke.py +18 -19
  71. bbot/modules/emailformat.py +1 -1
  72. bbot/modules/extractous.py +122 -0
  73. bbot/modules/filedownload.py +9 -7
  74. bbot/modules/fullhunt.py +7 -4
  75. bbot/modules/generic_ssrf.py +5 -5
  76. bbot/modules/github_codesearch.py +3 -2
  77. bbot/modules/github_org.py +4 -4
  78. bbot/modules/github_workflows.py +4 -4
  79. bbot/modules/gitlab.py +2 -5
  80. bbot/modules/google_playstore.py +93 -0
  81. bbot/modules/gowitness.py +48 -50
  82. bbot/modules/hackertarget.py +5 -3
  83. bbot/modules/host_header.py +5 -5
  84. bbot/modules/httpx.py +1 -4
  85. bbot/modules/hunterio.py +3 -9
  86. bbot/modules/iis_shortnames.py +19 -30
  87. bbot/modules/internal/cloudcheck.py +27 -12
  88. bbot/modules/internal/dnsresolve.py +250 -276
  89. bbot/modules/internal/excavate.py +100 -64
  90. bbot/modules/internal/speculate.py +42 -33
  91. bbot/modules/internetdb.py +4 -2
  92. bbot/modules/ip2location.py +3 -5
  93. bbot/modules/ipneighbor.py +1 -1
  94. bbot/modules/ipstack.py +3 -8
  95. bbot/modules/jadx.py +87 -0
  96. bbot/modules/leakix.py +11 -10
  97. bbot/modules/myssl.py +2 -2
  98. bbot/modules/newsletters.py +2 -2
  99. bbot/modules/otx.py +5 -3
  100. bbot/modules/output/asset_inventory.py +7 -7
  101. bbot/modules/output/base.py +1 -1
  102. bbot/modules/output/csv.py +1 -2
  103. bbot/modules/output/http.py +20 -14
  104. bbot/modules/output/mysql.py +51 -0
  105. bbot/modules/output/neo4j.py +7 -2
  106. bbot/modules/output/postgres.py +49 -0
  107. bbot/modules/output/slack.py +0 -1
  108. bbot/modules/output/sqlite.py +29 -0
  109. bbot/modules/output/stdout.py +2 -2
  110. bbot/modules/output/teams.py +107 -6
  111. bbot/modules/paramminer_headers.py +5 -8
  112. bbot/modules/passivetotal.py +13 -13
  113. bbot/modules/portscan.py +32 -6
  114. bbot/modules/postman.py +50 -126
  115. bbot/modules/postman_download.py +220 -0
  116. bbot/modules/rapiddns.py +3 -8
  117. bbot/modules/report/asn.py +11 -11
  118. bbot/modules/robots.py +3 -3
  119. bbot/modules/securitytrails.py +7 -10
  120. bbot/modules/securitytxt.py +128 -0
  121. bbot/modules/shodan_dns.py +7 -9
  122. bbot/modules/sitedossier.py +1 -1
  123. bbot/modules/skymem.py +2 -2
  124. bbot/modules/social.py +2 -1
  125. bbot/modules/subdomaincenter.py +1 -1
  126. bbot/modules/subdomainradar.py +160 -0
  127. bbot/modules/telerik.py +8 -8
  128. bbot/modules/templates/bucket.py +1 -1
  129. bbot/modules/templates/github.py +22 -14
  130. bbot/modules/templates/postman.py +21 -0
  131. bbot/modules/templates/shodan.py +14 -13
  132. bbot/modules/templates/sql.py +95 -0
  133. bbot/modules/templates/subdomain_enum.py +53 -17
  134. bbot/modules/templates/webhook.py +2 -4
  135. bbot/modules/trickest.py +8 -37
  136. bbot/modules/trufflehog.py +18 -3
  137. bbot/modules/url_manipulation.py +3 -3
  138. bbot/modules/urlscan.py +1 -1
  139. bbot/modules/viewdns.py +1 -1
  140. bbot/modules/virustotal.py +8 -30
  141. bbot/modules/wafw00f.py +1 -1
  142. bbot/modules/wayback.py +1 -1
  143. bbot/modules/wpscan.py +17 -11
  144. bbot/modules/zoomeye.py +11 -6
  145. bbot/presets/baddns-thorough.yml +12 -0
  146. bbot/presets/fast.yml +16 -0
  147. bbot/presets/kitchen-sink.yml +1 -0
  148. bbot/presets/spider.yml +4 -0
  149. bbot/presets/subdomain-enum.yml +7 -7
  150. bbot/scanner/manager.py +5 -16
  151. bbot/scanner/preset/args.py +44 -26
  152. bbot/scanner/preset/environ.py +7 -2
  153. bbot/scanner/preset/path.py +7 -4
  154. bbot/scanner/preset/preset.py +36 -23
  155. bbot/scanner/scanner.py +176 -63
  156. bbot/scanner/target.py +236 -434
  157. bbot/scripts/docs.py +1 -1
  158. bbot/test/bbot_fixtures.py +22 -3
  159. bbot/test/conftest.py +132 -100
  160. bbot/test/fastapi_test.py +17 -0
  161. bbot/test/owasp_mastg.apk +0 -0
  162. bbot/test/run_tests.sh +4 -4
  163. bbot/test/test.conf +2 -0
  164. bbot/test/test_step_1/test_bbot_fastapi.py +82 -0
  165. bbot/test/test_step_1/test_bloom_filter.py +2 -0
  166. bbot/test/test_step_1/test_cli.py +138 -64
  167. bbot/test/test_step_1/test_dns.py +392 -70
  168. bbot/test/test_step_1/test_engine.py +17 -17
  169. bbot/test/test_step_1/test_events.py +203 -37
  170. bbot/test/test_step_1/test_helpers.py +64 -28
  171. bbot/test/test_step_1/test_manager_deduplication.py +1 -1
  172. bbot/test/test_step_1/test_manager_scope_accuracy.py +336 -338
  173. bbot/test/test_step_1/test_modules_basic.py +69 -71
  174. bbot/test/test_step_1/test_presets.py +184 -96
  175. bbot/test/test_step_1/test_python_api.py +7 -2
  176. bbot/test/test_step_1/test_regexes.py +35 -5
  177. bbot/test/test_step_1/test_scan.py +39 -5
  178. bbot/test/test_step_1/test_scope.py +5 -4
  179. bbot/test/test_step_1/test_target.py +243 -145
  180. bbot/test/test_step_1/test_web.py +48 -10
  181. bbot/test/test_step_2/module_tests/base.py +17 -20
  182. bbot/test/test_step_2/module_tests/test_module_anubisdb.py +1 -1
  183. bbot/test/test_step_2/module_tests/test_module_apkpure.py +71 -0
  184. bbot/test/test_step_2/module_tests/test_module_asset_inventory.py +0 -1
  185. bbot/test/test_step_2/module_tests/test_module_azure_realm.py +1 -1
  186. bbot/test/test_step_2/module_tests/test_module_baddns.py +6 -6
  187. bbot/test/test_step_2/module_tests/test_module_baddns_direct.py +62 -0
  188. bbot/test/test_step_2/module_tests/test_module_bevigil.py +29 -2
  189. bbot/test/test_step_2/module_tests/test_module_binaryedge.py +4 -2
  190. bbot/test/test_step_2/module_tests/test_module_bucket_amazon.py +2 -2
  191. bbot/test/test_step_2/module_tests/test_module_bucket_azure.py +1 -1
  192. bbot/test/test_step_2/module_tests/test_module_bufferoverrun.py +35 -0
  193. bbot/test/test_step_2/module_tests/test_module_builtwith.py +2 -2
  194. bbot/test/test_step_2/module_tests/test_module_bypass403.py +1 -1
  195. bbot/test/test_step_2/module_tests/test_module_c99.py +126 -0
  196. bbot/test/test_step_2/module_tests/test_module_censys.py +4 -1
  197. bbot/test/test_step_2/module_tests/test_module_cloudcheck.py +4 -0
  198. bbot/test/test_step_2/module_tests/test_module_code_repository.py +11 -1
  199. bbot/test/test_step_2/module_tests/test_module_columbus.py +1 -1
  200. bbot/test/test_step_2/module_tests/test_module_credshed.py +3 -3
  201. bbot/test/test_step_2/module_tests/test_module_dastardly.py +2 -1
  202. bbot/test/test_step_2/module_tests/test_module_dehashed.py +2 -2
  203. bbot/test/test_step_2/module_tests/test_module_digitorus.py +1 -1
  204. bbot/test/test_step_2/module_tests/test_module_discord.py +1 -1
  205. bbot/test/test_step_2/module_tests/test_module_dnsbimi.py +103 -0
  206. bbot/test/test_step_2/module_tests/test_module_dnsbrute.py +9 -10
  207. bbot/test/test_step_2/module_tests/test_module_dnsbrute_mutations.py +1 -2
  208. bbot/test/test_step_2/module_tests/test_module_dnscommonsrv.py +1 -2
  209. bbot/test/test_step_2/module_tests/test_module_dnsdumpster.py +4 -4
  210. bbot/test/test_step_2/module_tests/test_module_dnstlsrpt.py +64 -0
  211. bbot/test/test_step_2/module_tests/test_module_dotnetnuke.py +0 -8
  212. bbot/test/test_step_2/module_tests/test_module_excavate.py +17 -37
  213. bbot/test/test_step_2/module_tests/test_module_extractous.py +54 -0
  214. bbot/test/test_step_2/module_tests/test_module_ffuf_shortnames.py +1 -1
  215. bbot/test/test_step_2/module_tests/test_module_filedownload.py +14 -14
  216. bbot/test/test_step_2/module_tests/test_module_git_clone.py +2 -2
  217. bbot/test/test_step_2/module_tests/test_module_github_org.py +19 -8
  218. bbot/test/test_step_2/module_tests/test_module_github_workflows.py +1 -1
  219. bbot/test/test_step_2/module_tests/test_module_gitlab.py +9 -4
  220. bbot/test/test_step_2/module_tests/test_module_google_playstore.py +83 -0
  221. bbot/test/test_step_2/module_tests/test_module_gowitness.py +4 -4
  222. bbot/test/test_step_2/module_tests/test_module_host_header.py +1 -1
  223. bbot/test/test_step_2/module_tests/test_module_http.py +4 -4
  224. bbot/test/test_step_2/module_tests/test_module_httpx.py +10 -8
  225. bbot/test/test_step_2/module_tests/test_module_hunterio.py +68 -4
  226. bbot/test/test_step_2/module_tests/test_module_jadx.py +55 -0
  227. bbot/test/test_step_2/module_tests/test_module_json.py +24 -11
  228. bbot/test/test_step_2/module_tests/test_module_leakix.py +7 -3
  229. bbot/test/test_step_2/module_tests/test_module_mysql.py +76 -0
  230. bbot/test/test_step_2/module_tests/test_module_myssl.py +1 -1
  231. bbot/test/test_step_2/module_tests/test_module_neo4j.py +1 -1
  232. bbot/test/test_step_2/module_tests/test_module_newsletters.py +6 -6
  233. bbot/test/test_step_2/module_tests/test_module_ntlm.py +7 -7
  234. bbot/test/test_step_2/module_tests/test_module_oauth.py +1 -1
  235. bbot/test/test_step_2/module_tests/test_module_otx.py +1 -1
  236. bbot/test/test_step_2/module_tests/test_module_paramminer_cookies.py +1 -2
  237. bbot/test/test_step_2/module_tests/test_module_paramminer_getparams.py +0 -6
  238. bbot/test/test_step_2/module_tests/test_module_paramminer_headers.py +2 -9
  239. bbot/test/test_step_2/module_tests/test_module_passivetotal.py +3 -1
  240. bbot/test/test_step_2/module_tests/test_module_portscan.py +9 -8
  241. bbot/test/test_step_2/module_tests/test_module_postgres.py +74 -0
  242. bbot/test/test_step_2/module_tests/test_module_postman.py +84 -253
  243. bbot/test/test_step_2/module_tests/test_module_postman_download.py +439 -0
  244. bbot/test/test_step_2/module_tests/test_module_rapiddns.py +93 -1
  245. bbot/test/test_step_2/module_tests/test_module_securitytxt.py +50 -0
  246. bbot/test/test_step_2/module_tests/test_module_shodan_dns.py +20 -1
  247. bbot/test/test_step_2/module_tests/test_module_sitedossier.py +2 -2
  248. bbot/test/test_step_2/module_tests/test_module_smuggler.py +1 -1
  249. bbot/test/test_step_2/module_tests/test_module_social.py +11 -1
  250. bbot/test/test_step_2/module_tests/test_module_speculate.py +2 -6
  251. bbot/test/test_step_2/module_tests/test_module_splunk.py +4 -4
  252. bbot/test/test_step_2/module_tests/test_module_sqlite.py +18 -0
  253. bbot/test/test_step_2/module_tests/test_module_sslcert.py +1 -1
  254. bbot/test/test_step_2/module_tests/test_module_stdout.py +5 -3
  255. bbot/test/test_step_2/module_tests/test_module_subdomaincenter.py +1 -1
  256. bbot/test/test_step_2/module_tests/test_module_subdomainradar.py +208 -0
  257. bbot/test/test_step_2/module_tests/test_module_subdomains.py +1 -1
  258. bbot/test/test_step_2/module_tests/test_module_teams.py +8 -6
  259. bbot/test/test_step_2/module_tests/test_module_telerik.py +1 -1
  260. bbot/test/test_step_2/module_tests/test_module_trufflehog.py +317 -11
  261. bbot/test/test_step_2/module_tests/test_module_wayback.py +1 -1
  262. bbot/test/test_step_2/template_tests/test_template_subdomain_enum.py +135 -0
  263. {bbot-2.0.1.4654rc0.dist-info → bbot-2.3.0.5397rc0.dist-info}/METADATA +48 -18
  264. bbot-2.3.0.5397rc0.dist-info/RECORD +421 -0
  265. {bbot-2.0.1.4654rc0.dist-info → bbot-2.3.0.5397rc0.dist-info}/WHEEL +1 -1
  266. bbot/modules/unstructured.py +0 -163
  267. bbot/test/test_step_2/module_tests/test_module_unstructured.py +0 -102
  268. bbot-2.0.1.4654rc0.dist-info/RECORD +0 -385
  269. {bbot-2.0.1.4654rc0.dist-info → bbot-2.3.0.5397rc0.dist-info}/LICENSE +0 -0
  270. {bbot-2.0.1.4654rc0.dist-info → bbot-2.3.0.5397rc0.dist-info}/entry_points.txt +0 -0
@@ -1,163 +0,0 @@
1
- import os
2
- from pathlib import Path
3
-
4
- from bbot.modules.base import BaseModule
5
-
6
-
7
- class unstructured(BaseModule):
8
- watched_events = ["FILESYSTEM"]
9
- produced_events = ["FILESYSTEM", "RAW_TEXT"]
10
- flags = ["passive", "safe"]
11
- meta = {
12
- "description": "Module to extract data from files",
13
- "created_date": "2024-06-03",
14
- "author": "@domwhewell-sage",
15
- }
16
- options = {
17
- "extensions": [
18
- "bak", # Backup File
19
- "bash", # Bash Script or Configuration
20
- "bashrc", # Bash Script or Configuration
21
- "conf", # Configuration File
22
- "cfg", # Configuration File
23
- "crt", # Certificate File
24
- "csv", # Comma Separated Values File
25
- "db", # SQLite Database File
26
- "sqlite", # SQLite Database File
27
- "doc", # Microsoft Word Document (Old Format)
28
- "docx", # Microsoft Word Document
29
- "ica", # Citrix Independent Computing Architecture File
30
- "indd", # Adobe InDesign Document
31
- "ini", # Initialization File
32
- "key", # Private Key File
33
- "pub", # Public Key File
34
- "log", # Log File
35
- "markdown", # Markdown File
36
- "md", # Markdown File
37
- "odg", # OpenDocument Graphics (LibreOffice, OpenOffice)
38
- "odp", # OpenDocument Presentation (LibreOffice, OpenOffice)
39
- "ods", # OpenDocument Spreadsheet (LibreOffice, OpenOffice)
40
- "odt", # OpenDocument Text (LibreOffice, OpenOffice)
41
- "pdf", # Adobe Portable Document Format
42
- "pem", # Privacy Enhanced Mail (SSL certificate)
43
- "pps", # Microsoft PowerPoint Slideshow (Old Format)
44
- "ppsx", # Microsoft PowerPoint Slideshow
45
- "ppt", # Microsoft PowerPoint Presentation (Old Format)
46
- "pptx", # Microsoft PowerPoint Presentation
47
- "ps1", # PowerShell Script
48
- "rdp", # Remote Desktop Protocol File
49
- "sh", # Shell Script
50
- "sql", # SQL Database Dump
51
- "swp", # Swap File (temporary file, often Vim)
52
- "sxw", # OpenOffice.org Writer document
53
- "txt", # Plain Text Document
54
- "vbs", # Visual Basic Script
55
- "wpd", # WordPerfect Document
56
- "xls", # Microsoft Excel Spreadsheet (Old Format)
57
- "xlsx", # Microsoft Excel Spreadsheet
58
- "xml", # eXtensible Markup Language File
59
- "yml", # YAML Ain't Markup Language
60
- "yaml", # YAML Ain't Markup Language
61
- ],
62
- "ignore_folders": [".git"],
63
- }
64
- options_desc = {
65
- "extensions": "File extensions to parse",
66
- "ignore_folders": "Subfolders to ignore when crawling downloaded folders",
67
- }
68
-
69
- deps_apt = ["libmagic-dev", "poppler-utils", "tesseract-ocr", "libreoffice", "pandoc"]
70
- deps_pip = ["unstructured[all-docs]>=0.15.7,<1.0", "nltk>=3.9.0,<4.0"]
71
-
72
- scope_distance_modifier = 1
73
-
74
- async def setup(self):
75
- self.extensions = list(set([e.lower().strip(".") for e in self.config.get("extensions", [])]))
76
- self.ignored_folders = self.config.get("ignore_folders", [])
77
- # Do not send user statistics to the unstructured library
78
- os.environ["SCARF_NO_ANALYTICS"] = "true"
79
- return True
80
-
81
- async def filter_event(self, event):
82
- if "file" not in event.tags and "folder" not in event.tags:
83
- return False, "Event is not a file or folder"
84
- if "file" in event.tags:
85
- if not any(event.data["path"].endswith(f".{ext}") for ext in self.extensions):
86
- return False, "File extension not in the allowed list"
87
- return True
88
-
89
- async def handle_event(self, event):
90
- if "folder" in event.tags:
91
- folder_path = Path(event.data["path"])
92
- for file_path in folder_path.rglob("*"):
93
- # If the file is not in an ignored folder and if it has an allowed extension raise it as a FILESYSTEM event
94
- if not any(ignored_folder in str(file_path) for ignored_folder in self.ignored_folders):
95
- if any(file_path.name.endswith(f".{ext}") for ext in self.extensions):
96
- file_event = self.make_event(
97
- {"path": str(file_path)}, "FILESYSTEM", tags=["parsed_folder", "file"], parent=event
98
- )
99
- await self.emit_event(file_event)
100
- elif "file" in event.tags:
101
- file_path = event.data["path"]
102
- content = await self.scan.helpers.run_in_executor_mp(extract_text, file_path)
103
- if content:
104
- raw_text_event = self.make_event(
105
- content,
106
- "RAW_TEXT",
107
- context=f"Extracted text from {file_path}",
108
- parent=event,
109
- )
110
- await self.emit_event(raw_text_event)
111
-
112
- async def finish(self):
113
- del os.environ["SCARF_NO_ANALYTICS"]
114
- return
115
-
116
-
117
- def extract_text(file_path):
118
- """
119
- extract_text Extracts plaintext from a document path using unstructured.
120
-
121
- :param file_path: The path of the file to extract text from.
122
- :return: ASCII-encoded plaintext extracted from the document.
123
- """
124
-
125
- from unstructured.partition.auto import partition
126
-
127
- unstructured_file_types = [
128
- ".csv",
129
- ".eml",
130
- ".msg",
131
- ".epub",
132
- ".xlsx",
133
- ".xls",
134
- ".html",
135
- ".htm",
136
- ".md",
137
- ".org",
138
- ".odt",
139
- ".pdf",
140
- ".txt",
141
- ".text",
142
- ".log",
143
- ".ppt",
144
- ".pptx",
145
- ".rst",
146
- ".rtf",
147
- ".tsv",
148
- ".doc",
149
- ".docx",
150
- ".xml",
151
- ]
152
-
153
- # If the file can be extracted with unstructured use its partition function or try and read it
154
- if any(file_path.lower().endswith(file_type) for file_type in unstructured_file_types):
155
- try:
156
- elements = partition(filename=file_path)
157
- return "\n\n".join(element.text for element in elements)
158
- except ValueError:
159
- with open(file_path, "rb") as file:
160
- return file.read().decode("utf-8", errors="ignore")
161
- else:
162
- with open(file_path, "rb") as file:
163
- return file.read().decode("utf-8", errors="ignore")
@@ -1,102 +0,0 @@
1
- from pathlib import Path
2
- from .base import ModuleTestBase
3
-
4
-
5
- class TestUnstructured(ModuleTestBase):
6
- targets = ["http://127.0.0.1:8888"]
7
- modules_overrides = ["unstructured", "filedownload", "httpx", "excavate", "speculate"]
8
- config_overrides = {"web": {"spider_distance": 2, "spider_depth": 2}}
9
-
10
- pdf_data = r"""%PDF-1.3
11
- %���� ReportLab Generated PDF document http://www.reportlab.com
12
- 1 0 obj
13
- <<
14
- /F1 2 0 R
15
- >>
16
- endobj
17
- 2 0 obj
18
- <<
19
- /BaseFont /Helvetica /Encoding /WinAnsiEncoding /Name /F1 /Subtype /Type1 /Type /Font
20
- >>
21
- endobj
22
- 3 0 obj
23
- <<
24
- /Contents 7 0 R /MediaBox [ 0 0 595.2756 841.8898 ] /Parent 6 0 R /Resources <<
25
- /Font 1 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ]
26
- >> /Rotate 0 /Trans <<
27
-
28
- >>
29
- /Type /Page
30
- >>
31
- endobj
32
- 4 0 obj
33
- <<
34
- /PageMode /UseNone /Pages 6 0 R /Type /Catalog
35
- >>
36
- endobj
37
- 5 0 obj
38
- <<
39
- /Author (anonymous) /CreationDate (D:20240603185816+00'00') /Creator (ReportLab PDF Library - www.reportlab.com) /Keywords () /ModDate (D:20240603185816+00'00') /Producer (ReportLab PDF Library - www.reportlab.com)
40
- /Subject (unspecified) /Title (untitled) /Trapped /False
41
- >>
42
- endobj
43
- 6 0 obj
44
- <<
45
- /Count 1 /Kids [ 3 0 R ] /Type /Pages
46
- >>
47
- endobj
48
- 7 0 obj
49
- <<
50
- /Filter [ /ASCII85Decode /FlateDecode ] /Length 107
51
- >>
52
- stream
53
- GapQh0E=F,0U\H3T\pNYT^QKk?tc>IP,;W#U1^23ihPEM_?CW4KISi90MjG^2,FS#<RC5+c,n)Z;$bK$b"5I[<!^TD#gi]&=5X,[5@Y@V~>endstream
54
- endobj
55
- xref
56
- 0 8
57
- 0000000000 65535 f
58
- 0000000073 00000 n
59
- 0000000104 00000 n
60
- 0000000211 00000 n
61
- 0000000414 00000 n
62
- 0000000482 00000 n
63
- 0000000778 00000 n
64
- 0000000837 00000 n
65
- trailer
66
- <<
67
- /ID
68
- [<80d9f5b964fc99284501deb7a6a637f7><80d9f5b964fc99284501deb7a6a637f7>]
69
- % ReportLab generated PDF document -- digest (http://www.reportlab.com)
70
-
71
- /Info 5 0 R
72
- /Root 4 0 R
73
- /Size 8
74
- >>
75
- startxref
76
- 1034
77
- %%EOF"""
78
-
79
- unstructured_response = "Hello, World!"
80
-
81
- async def setup_after_prep(self, module_test):
82
- module_test.set_expect_requests(
83
- dict(uri="/"),
84
- dict(response_data='<a href="/Test_PDF"/>'),
85
- )
86
- module_test.set_expect_requests(
87
- dict(uri="/Test_PDF"),
88
- dict(response_data=self.pdf_data, headers={"Content-Type": "application/pdf"}),
89
- )
90
-
91
- def check(self, module_test, events):
92
- filesystem_events = [e for e in events if e.type == "FILESYSTEM"]
93
- assert 1 == len(filesystem_events), filesystem_events
94
- filesystem_event = filesystem_events[0]
95
- file = Path(filesystem_event.data["path"])
96
- assert file.is_file(), "Destination file doesn't exist"
97
- assert open(file).read() == self.pdf_data, f"File at {file} does not contain the correct content"
98
- raw_text_events = [e for e in events if e.type == "RAW_TEXT"]
99
- assert 1 == len(raw_text_events), "Failed to emit RAW_TEXT event"
100
- assert (
101
- raw_text_events[0].data == self.unstructured_response
102
- ), f"Text extracted from PDF is incorrect, got {raw_text_events[0].data}"