bbot 2.0.1.4654rc0__py3-none-any.whl → 2.3.0.5397rc0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of bbot might be problematic. Click here for more details.
- bbot/__init__.py +1 -1
- bbot/cli.py +3 -7
- bbot/core/config/files.py +0 -1
- bbot/core/config/logger.py +34 -4
- bbot/core/core.py +21 -6
- bbot/core/engine.py +9 -8
- bbot/core/event/base.py +162 -63
- bbot/core/helpers/bloom.py +10 -3
- bbot/core/helpers/command.py +9 -8
- bbot/core/helpers/depsinstaller/installer.py +89 -32
- bbot/core/helpers/depsinstaller/sudo_askpass.py +38 -2
- bbot/core/helpers/diff.py +10 -10
- bbot/core/helpers/dns/brute.py +18 -14
- bbot/core/helpers/dns/dns.py +16 -15
- bbot/core/helpers/dns/engine.py +159 -132
- bbot/core/helpers/dns/helpers.py +2 -2
- bbot/core/helpers/dns/mock.py +26 -8
- bbot/core/helpers/files.py +1 -1
- bbot/core/helpers/helper.py +7 -4
- bbot/core/helpers/interactsh.py +3 -3
- bbot/core/helpers/libmagic.py +65 -0
- bbot/core/helpers/misc.py +65 -22
- bbot/core/helpers/names_generator.py +17 -3
- bbot/core/helpers/process.py +0 -20
- bbot/core/helpers/regex.py +1 -1
- bbot/core/helpers/regexes.py +12 -6
- bbot/core/helpers/validators.py +1 -2
- bbot/core/helpers/web/client.py +1 -1
- bbot/core/helpers/web/engine.py +18 -13
- bbot/core/helpers/web/web.py +25 -116
- bbot/core/helpers/wordcloud.py +5 -5
- bbot/core/modules.py +36 -27
- bbot/core/multiprocess.py +58 -0
- bbot/core/shared_deps.py +46 -3
- bbot/db/sql/models.py +147 -0
- bbot/defaults.yml +15 -10
- bbot/errors.py +0 -8
- bbot/modules/anubisdb.py +2 -2
- bbot/modules/apkpure.py +63 -0
- bbot/modules/azure_tenant.py +2 -2
- bbot/modules/baddns.py +35 -19
- bbot/modules/baddns_direct.py +92 -0
- bbot/modules/baddns_zone.py +3 -8
- bbot/modules/badsecrets.py +4 -3
- bbot/modules/base.py +195 -51
- bbot/modules/bevigil.py +7 -7
- bbot/modules/binaryedge.py +7 -4
- bbot/modules/bufferoverrun.py +47 -0
- bbot/modules/builtwith.py +6 -10
- bbot/modules/bypass403.py +5 -5
- bbot/modules/c99.py +10 -7
- bbot/modules/censys.py +9 -13
- bbot/modules/certspotter.py +5 -3
- bbot/modules/chaos.py +9 -7
- bbot/modules/code_repository.py +1 -0
- bbot/modules/columbus.py +3 -3
- bbot/modules/crt.py +5 -3
- bbot/modules/deadly/dastardly.py +1 -1
- bbot/modules/deadly/ffuf.py +9 -9
- bbot/modules/deadly/nuclei.py +3 -3
- bbot/modules/deadly/vhost.py +4 -3
- bbot/modules/dehashed.py +1 -1
- bbot/modules/digitorus.py +1 -1
- bbot/modules/dnsbimi.py +145 -0
- bbot/modules/dnscaa.py +3 -3
- bbot/modules/dnsdumpster.py +4 -4
- bbot/modules/dnstlsrpt.py +144 -0
- bbot/modules/docker_pull.py +7 -5
- bbot/modules/dockerhub.py +2 -2
- bbot/modules/dotnetnuke.py +18 -19
- bbot/modules/emailformat.py +1 -1
- bbot/modules/extractous.py +122 -0
- bbot/modules/filedownload.py +9 -7
- bbot/modules/fullhunt.py +7 -4
- bbot/modules/generic_ssrf.py +5 -5
- bbot/modules/github_codesearch.py +3 -2
- bbot/modules/github_org.py +4 -4
- bbot/modules/github_workflows.py +4 -4
- bbot/modules/gitlab.py +2 -5
- bbot/modules/google_playstore.py +93 -0
- bbot/modules/gowitness.py +48 -50
- bbot/modules/hackertarget.py +5 -3
- bbot/modules/host_header.py +5 -5
- bbot/modules/httpx.py +1 -4
- bbot/modules/hunterio.py +3 -9
- bbot/modules/iis_shortnames.py +19 -30
- bbot/modules/internal/cloudcheck.py +27 -12
- bbot/modules/internal/dnsresolve.py +250 -276
- bbot/modules/internal/excavate.py +100 -64
- bbot/modules/internal/speculate.py +42 -33
- bbot/modules/internetdb.py +4 -2
- bbot/modules/ip2location.py +3 -5
- bbot/modules/ipneighbor.py +1 -1
- bbot/modules/ipstack.py +3 -8
- bbot/modules/jadx.py +87 -0
- bbot/modules/leakix.py +11 -10
- bbot/modules/myssl.py +2 -2
- bbot/modules/newsletters.py +2 -2
- bbot/modules/otx.py +5 -3
- bbot/modules/output/asset_inventory.py +7 -7
- bbot/modules/output/base.py +1 -1
- bbot/modules/output/csv.py +1 -2
- bbot/modules/output/http.py +20 -14
- bbot/modules/output/mysql.py +51 -0
- bbot/modules/output/neo4j.py +7 -2
- bbot/modules/output/postgres.py +49 -0
- bbot/modules/output/slack.py +0 -1
- bbot/modules/output/sqlite.py +29 -0
- bbot/modules/output/stdout.py +2 -2
- bbot/modules/output/teams.py +107 -6
- bbot/modules/paramminer_headers.py +5 -8
- bbot/modules/passivetotal.py +13 -13
- bbot/modules/portscan.py +32 -6
- bbot/modules/postman.py +50 -126
- bbot/modules/postman_download.py +220 -0
- bbot/modules/rapiddns.py +3 -8
- bbot/modules/report/asn.py +11 -11
- bbot/modules/robots.py +3 -3
- bbot/modules/securitytrails.py +7 -10
- bbot/modules/securitytxt.py +128 -0
- bbot/modules/shodan_dns.py +7 -9
- bbot/modules/sitedossier.py +1 -1
- bbot/modules/skymem.py +2 -2
- bbot/modules/social.py +2 -1
- bbot/modules/subdomaincenter.py +1 -1
- bbot/modules/subdomainradar.py +160 -0
- bbot/modules/telerik.py +8 -8
- bbot/modules/templates/bucket.py +1 -1
- bbot/modules/templates/github.py +22 -14
- bbot/modules/templates/postman.py +21 -0
- bbot/modules/templates/shodan.py +14 -13
- bbot/modules/templates/sql.py +95 -0
- bbot/modules/templates/subdomain_enum.py +53 -17
- bbot/modules/templates/webhook.py +2 -4
- bbot/modules/trickest.py +8 -37
- bbot/modules/trufflehog.py +18 -3
- bbot/modules/url_manipulation.py +3 -3
- bbot/modules/urlscan.py +1 -1
- bbot/modules/viewdns.py +1 -1
- bbot/modules/virustotal.py +8 -30
- bbot/modules/wafw00f.py +1 -1
- bbot/modules/wayback.py +1 -1
- bbot/modules/wpscan.py +17 -11
- bbot/modules/zoomeye.py +11 -6
- bbot/presets/baddns-thorough.yml +12 -0
- bbot/presets/fast.yml +16 -0
- bbot/presets/kitchen-sink.yml +1 -0
- bbot/presets/spider.yml +4 -0
- bbot/presets/subdomain-enum.yml +7 -7
- bbot/scanner/manager.py +5 -16
- bbot/scanner/preset/args.py +44 -26
- bbot/scanner/preset/environ.py +7 -2
- bbot/scanner/preset/path.py +7 -4
- bbot/scanner/preset/preset.py +36 -23
- bbot/scanner/scanner.py +176 -63
- bbot/scanner/target.py +236 -434
- bbot/scripts/docs.py +1 -1
- bbot/test/bbot_fixtures.py +22 -3
- bbot/test/conftest.py +132 -100
- bbot/test/fastapi_test.py +17 -0
- bbot/test/owasp_mastg.apk +0 -0
- bbot/test/run_tests.sh +4 -4
- bbot/test/test.conf +2 -0
- bbot/test/test_step_1/test_bbot_fastapi.py +82 -0
- bbot/test/test_step_1/test_bloom_filter.py +2 -0
- bbot/test/test_step_1/test_cli.py +138 -64
- bbot/test/test_step_1/test_dns.py +392 -70
- bbot/test/test_step_1/test_engine.py +17 -17
- bbot/test/test_step_1/test_events.py +203 -37
- bbot/test/test_step_1/test_helpers.py +64 -28
- bbot/test/test_step_1/test_manager_deduplication.py +1 -1
- bbot/test/test_step_1/test_manager_scope_accuracy.py +336 -338
- bbot/test/test_step_1/test_modules_basic.py +69 -71
- bbot/test/test_step_1/test_presets.py +184 -96
- bbot/test/test_step_1/test_python_api.py +7 -2
- bbot/test/test_step_1/test_regexes.py +35 -5
- bbot/test/test_step_1/test_scan.py +39 -5
- bbot/test/test_step_1/test_scope.py +5 -4
- bbot/test/test_step_1/test_target.py +243 -145
- bbot/test/test_step_1/test_web.py +48 -10
- bbot/test/test_step_2/module_tests/base.py +17 -20
- bbot/test/test_step_2/module_tests/test_module_anubisdb.py +1 -1
- bbot/test/test_step_2/module_tests/test_module_apkpure.py +71 -0
- bbot/test/test_step_2/module_tests/test_module_asset_inventory.py +0 -1
- bbot/test/test_step_2/module_tests/test_module_azure_realm.py +1 -1
- bbot/test/test_step_2/module_tests/test_module_baddns.py +6 -6
- bbot/test/test_step_2/module_tests/test_module_baddns_direct.py +62 -0
- bbot/test/test_step_2/module_tests/test_module_bevigil.py +29 -2
- bbot/test/test_step_2/module_tests/test_module_binaryedge.py +4 -2
- bbot/test/test_step_2/module_tests/test_module_bucket_amazon.py +2 -2
- bbot/test/test_step_2/module_tests/test_module_bucket_azure.py +1 -1
- bbot/test/test_step_2/module_tests/test_module_bufferoverrun.py +35 -0
- bbot/test/test_step_2/module_tests/test_module_builtwith.py +2 -2
- bbot/test/test_step_2/module_tests/test_module_bypass403.py +1 -1
- bbot/test/test_step_2/module_tests/test_module_c99.py +126 -0
- bbot/test/test_step_2/module_tests/test_module_censys.py +4 -1
- bbot/test/test_step_2/module_tests/test_module_cloudcheck.py +4 -0
- bbot/test/test_step_2/module_tests/test_module_code_repository.py +11 -1
- bbot/test/test_step_2/module_tests/test_module_columbus.py +1 -1
- bbot/test/test_step_2/module_tests/test_module_credshed.py +3 -3
- bbot/test/test_step_2/module_tests/test_module_dastardly.py +2 -1
- bbot/test/test_step_2/module_tests/test_module_dehashed.py +2 -2
- bbot/test/test_step_2/module_tests/test_module_digitorus.py +1 -1
- bbot/test/test_step_2/module_tests/test_module_discord.py +1 -1
- bbot/test/test_step_2/module_tests/test_module_dnsbimi.py +103 -0
- bbot/test/test_step_2/module_tests/test_module_dnsbrute.py +9 -10
- bbot/test/test_step_2/module_tests/test_module_dnsbrute_mutations.py +1 -2
- bbot/test/test_step_2/module_tests/test_module_dnscommonsrv.py +1 -2
- bbot/test/test_step_2/module_tests/test_module_dnsdumpster.py +4 -4
- bbot/test/test_step_2/module_tests/test_module_dnstlsrpt.py +64 -0
- bbot/test/test_step_2/module_tests/test_module_dotnetnuke.py +0 -8
- bbot/test/test_step_2/module_tests/test_module_excavate.py +17 -37
- bbot/test/test_step_2/module_tests/test_module_extractous.py +54 -0
- bbot/test/test_step_2/module_tests/test_module_ffuf_shortnames.py +1 -1
- bbot/test/test_step_2/module_tests/test_module_filedownload.py +14 -14
- bbot/test/test_step_2/module_tests/test_module_git_clone.py +2 -2
- bbot/test/test_step_2/module_tests/test_module_github_org.py +19 -8
- bbot/test/test_step_2/module_tests/test_module_github_workflows.py +1 -1
- bbot/test/test_step_2/module_tests/test_module_gitlab.py +9 -4
- bbot/test/test_step_2/module_tests/test_module_google_playstore.py +83 -0
- bbot/test/test_step_2/module_tests/test_module_gowitness.py +4 -4
- bbot/test/test_step_2/module_tests/test_module_host_header.py +1 -1
- bbot/test/test_step_2/module_tests/test_module_http.py +4 -4
- bbot/test/test_step_2/module_tests/test_module_httpx.py +10 -8
- bbot/test/test_step_2/module_tests/test_module_hunterio.py +68 -4
- bbot/test/test_step_2/module_tests/test_module_jadx.py +55 -0
- bbot/test/test_step_2/module_tests/test_module_json.py +24 -11
- bbot/test/test_step_2/module_tests/test_module_leakix.py +7 -3
- bbot/test/test_step_2/module_tests/test_module_mysql.py +76 -0
- bbot/test/test_step_2/module_tests/test_module_myssl.py +1 -1
- bbot/test/test_step_2/module_tests/test_module_neo4j.py +1 -1
- bbot/test/test_step_2/module_tests/test_module_newsletters.py +6 -6
- bbot/test/test_step_2/module_tests/test_module_ntlm.py +7 -7
- bbot/test/test_step_2/module_tests/test_module_oauth.py +1 -1
- bbot/test/test_step_2/module_tests/test_module_otx.py +1 -1
- bbot/test/test_step_2/module_tests/test_module_paramminer_cookies.py +1 -2
- bbot/test/test_step_2/module_tests/test_module_paramminer_getparams.py +0 -6
- bbot/test/test_step_2/module_tests/test_module_paramminer_headers.py +2 -9
- bbot/test/test_step_2/module_tests/test_module_passivetotal.py +3 -1
- bbot/test/test_step_2/module_tests/test_module_portscan.py +9 -8
- bbot/test/test_step_2/module_tests/test_module_postgres.py +74 -0
- bbot/test/test_step_2/module_tests/test_module_postman.py +84 -253
- bbot/test/test_step_2/module_tests/test_module_postman_download.py +439 -0
- bbot/test/test_step_2/module_tests/test_module_rapiddns.py +93 -1
- bbot/test/test_step_2/module_tests/test_module_securitytxt.py +50 -0
- bbot/test/test_step_2/module_tests/test_module_shodan_dns.py +20 -1
- bbot/test/test_step_2/module_tests/test_module_sitedossier.py +2 -2
- bbot/test/test_step_2/module_tests/test_module_smuggler.py +1 -1
- bbot/test/test_step_2/module_tests/test_module_social.py +11 -1
- bbot/test/test_step_2/module_tests/test_module_speculate.py +2 -6
- bbot/test/test_step_2/module_tests/test_module_splunk.py +4 -4
- bbot/test/test_step_2/module_tests/test_module_sqlite.py +18 -0
- bbot/test/test_step_2/module_tests/test_module_sslcert.py +1 -1
- bbot/test/test_step_2/module_tests/test_module_stdout.py +5 -3
- bbot/test/test_step_2/module_tests/test_module_subdomaincenter.py +1 -1
- bbot/test/test_step_2/module_tests/test_module_subdomainradar.py +208 -0
- bbot/test/test_step_2/module_tests/test_module_subdomains.py +1 -1
- bbot/test/test_step_2/module_tests/test_module_teams.py +8 -6
- bbot/test/test_step_2/module_tests/test_module_telerik.py +1 -1
- bbot/test/test_step_2/module_tests/test_module_trufflehog.py +317 -11
- bbot/test/test_step_2/module_tests/test_module_wayback.py +1 -1
- bbot/test/test_step_2/template_tests/test_template_subdomain_enum.py +135 -0
- {bbot-2.0.1.4654rc0.dist-info → bbot-2.3.0.5397rc0.dist-info}/METADATA +48 -18
- bbot-2.3.0.5397rc0.dist-info/RECORD +421 -0
- {bbot-2.0.1.4654rc0.dist-info → bbot-2.3.0.5397rc0.dist-info}/WHEEL +1 -1
- bbot/modules/unstructured.py +0 -163
- bbot/test/test_step_2/module_tests/test_module_unstructured.py +0 -102
- bbot-2.0.1.4654rc0.dist-info/RECORD +0 -385
- {bbot-2.0.1.4654rc0.dist-info → bbot-2.3.0.5397rc0.dist-info}/LICENSE +0 -0
- {bbot-2.0.1.4654rc0.dist-info → bbot-2.3.0.5397rc0.dist-info}/entry_points.txt +0 -0
bbot/modules/unstructured.py
DELETED
|
@@ -1,163 +0,0 @@
|
|
|
1
|
-
import os
|
|
2
|
-
from pathlib import Path
|
|
3
|
-
|
|
4
|
-
from bbot.modules.base import BaseModule
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
class unstructured(BaseModule):
|
|
8
|
-
watched_events = ["FILESYSTEM"]
|
|
9
|
-
produced_events = ["FILESYSTEM", "RAW_TEXT"]
|
|
10
|
-
flags = ["passive", "safe"]
|
|
11
|
-
meta = {
|
|
12
|
-
"description": "Module to extract data from files",
|
|
13
|
-
"created_date": "2024-06-03",
|
|
14
|
-
"author": "@domwhewell-sage",
|
|
15
|
-
}
|
|
16
|
-
options = {
|
|
17
|
-
"extensions": [
|
|
18
|
-
"bak", # Backup File
|
|
19
|
-
"bash", # Bash Script or Configuration
|
|
20
|
-
"bashrc", # Bash Script or Configuration
|
|
21
|
-
"conf", # Configuration File
|
|
22
|
-
"cfg", # Configuration File
|
|
23
|
-
"crt", # Certificate File
|
|
24
|
-
"csv", # Comma Separated Values File
|
|
25
|
-
"db", # SQLite Database File
|
|
26
|
-
"sqlite", # SQLite Database File
|
|
27
|
-
"doc", # Microsoft Word Document (Old Format)
|
|
28
|
-
"docx", # Microsoft Word Document
|
|
29
|
-
"ica", # Citrix Independent Computing Architecture File
|
|
30
|
-
"indd", # Adobe InDesign Document
|
|
31
|
-
"ini", # Initialization File
|
|
32
|
-
"key", # Private Key File
|
|
33
|
-
"pub", # Public Key File
|
|
34
|
-
"log", # Log File
|
|
35
|
-
"markdown", # Markdown File
|
|
36
|
-
"md", # Markdown File
|
|
37
|
-
"odg", # OpenDocument Graphics (LibreOffice, OpenOffice)
|
|
38
|
-
"odp", # OpenDocument Presentation (LibreOffice, OpenOffice)
|
|
39
|
-
"ods", # OpenDocument Spreadsheet (LibreOffice, OpenOffice)
|
|
40
|
-
"odt", # OpenDocument Text (LibreOffice, OpenOffice)
|
|
41
|
-
"pdf", # Adobe Portable Document Format
|
|
42
|
-
"pem", # Privacy Enhanced Mail (SSL certificate)
|
|
43
|
-
"pps", # Microsoft PowerPoint Slideshow (Old Format)
|
|
44
|
-
"ppsx", # Microsoft PowerPoint Slideshow
|
|
45
|
-
"ppt", # Microsoft PowerPoint Presentation (Old Format)
|
|
46
|
-
"pptx", # Microsoft PowerPoint Presentation
|
|
47
|
-
"ps1", # PowerShell Script
|
|
48
|
-
"rdp", # Remote Desktop Protocol File
|
|
49
|
-
"sh", # Shell Script
|
|
50
|
-
"sql", # SQL Database Dump
|
|
51
|
-
"swp", # Swap File (temporary file, often Vim)
|
|
52
|
-
"sxw", # OpenOffice.org Writer document
|
|
53
|
-
"txt", # Plain Text Document
|
|
54
|
-
"vbs", # Visual Basic Script
|
|
55
|
-
"wpd", # WordPerfect Document
|
|
56
|
-
"xls", # Microsoft Excel Spreadsheet (Old Format)
|
|
57
|
-
"xlsx", # Microsoft Excel Spreadsheet
|
|
58
|
-
"xml", # eXtensible Markup Language File
|
|
59
|
-
"yml", # YAML Ain't Markup Language
|
|
60
|
-
"yaml", # YAML Ain't Markup Language
|
|
61
|
-
],
|
|
62
|
-
"ignore_folders": [".git"],
|
|
63
|
-
}
|
|
64
|
-
options_desc = {
|
|
65
|
-
"extensions": "File extensions to parse",
|
|
66
|
-
"ignore_folders": "Subfolders to ignore when crawling downloaded folders",
|
|
67
|
-
}
|
|
68
|
-
|
|
69
|
-
deps_apt = ["libmagic-dev", "poppler-utils", "tesseract-ocr", "libreoffice", "pandoc"]
|
|
70
|
-
deps_pip = ["unstructured[all-docs]>=0.15.7,<1.0", "nltk>=3.9.0,<4.0"]
|
|
71
|
-
|
|
72
|
-
scope_distance_modifier = 1
|
|
73
|
-
|
|
74
|
-
async def setup(self):
|
|
75
|
-
self.extensions = list(set([e.lower().strip(".") for e in self.config.get("extensions", [])]))
|
|
76
|
-
self.ignored_folders = self.config.get("ignore_folders", [])
|
|
77
|
-
# Do not send user statistics to the unstructured library
|
|
78
|
-
os.environ["SCARF_NO_ANALYTICS"] = "true"
|
|
79
|
-
return True
|
|
80
|
-
|
|
81
|
-
async def filter_event(self, event):
|
|
82
|
-
if "file" not in event.tags and "folder" not in event.tags:
|
|
83
|
-
return False, "Event is not a file or folder"
|
|
84
|
-
if "file" in event.tags:
|
|
85
|
-
if not any(event.data["path"].endswith(f".{ext}") for ext in self.extensions):
|
|
86
|
-
return False, "File extension not in the allowed list"
|
|
87
|
-
return True
|
|
88
|
-
|
|
89
|
-
async def handle_event(self, event):
|
|
90
|
-
if "folder" in event.tags:
|
|
91
|
-
folder_path = Path(event.data["path"])
|
|
92
|
-
for file_path in folder_path.rglob("*"):
|
|
93
|
-
# If the file is not in an ignored folder and if it has an allowed extension raise it as a FILESYSTEM event
|
|
94
|
-
if not any(ignored_folder in str(file_path) for ignored_folder in self.ignored_folders):
|
|
95
|
-
if any(file_path.name.endswith(f".{ext}") for ext in self.extensions):
|
|
96
|
-
file_event = self.make_event(
|
|
97
|
-
{"path": str(file_path)}, "FILESYSTEM", tags=["parsed_folder", "file"], parent=event
|
|
98
|
-
)
|
|
99
|
-
await self.emit_event(file_event)
|
|
100
|
-
elif "file" in event.tags:
|
|
101
|
-
file_path = event.data["path"]
|
|
102
|
-
content = await self.scan.helpers.run_in_executor_mp(extract_text, file_path)
|
|
103
|
-
if content:
|
|
104
|
-
raw_text_event = self.make_event(
|
|
105
|
-
content,
|
|
106
|
-
"RAW_TEXT",
|
|
107
|
-
context=f"Extracted text from {file_path}",
|
|
108
|
-
parent=event,
|
|
109
|
-
)
|
|
110
|
-
await self.emit_event(raw_text_event)
|
|
111
|
-
|
|
112
|
-
async def finish(self):
|
|
113
|
-
del os.environ["SCARF_NO_ANALYTICS"]
|
|
114
|
-
return
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
def extract_text(file_path):
|
|
118
|
-
"""
|
|
119
|
-
extract_text Extracts plaintext from a document path using unstructured.
|
|
120
|
-
|
|
121
|
-
:param file_path: The path of the file to extract text from.
|
|
122
|
-
:return: ASCII-encoded plaintext extracted from the document.
|
|
123
|
-
"""
|
|
124
|
-
|
|
125
|
-
from unstructured.partition.auto import partition
|
|
126
|
-
|
|
127
|
-
unstructured_file_types = [
|
|
128
|
-
".csv",
|
|
129
|
-
".eml",
|
|
130
|
-
".msg",
|
|
131
|
-
".epub",
|
|
132
|
-
".xlsx",
|
|
133
|
-
".xls",
|
|
134
|
-
".html",
|
|
135
|
-
".htm",
|
|
136
|
-
".md",
|
|
137
|
-
".org",
|
|
138
|
-
".odt",
|
|
139
|
-
".pdf",
|
|
140
|
-
".txt",
|
|
141
|
-
".text",
|
|
142
|
-
".log",
|
|
143
|
-
".ppt",
|
|
144
|
-
".pptx",
|
|
145
|
-
".rst",
|
|
146
|
-
".rtf",
|
|
147
|
-
".tsv",
|
|
148
|
-
".doc",
|
|
149
|
-
".docx",
|
|
150
|
-
".xml",
|
|
151
|
-
]
|
|
152
|
-
|
|
153
|
-
# If the file can be extracted with unstructured use its partition function or try and read it
|
|
154
|
-
if any(file_path.lower().endswith(file_type) for file_type in unstructured_file_types):
|
|
155
|
-
try:
|
|
156
|
-
elements = partition(filename=file_path)
|
|
157
|
-
return "\n\n".join(element.text for element in elements)
|
|
158
|
-
except ValueError:
|
|
159
|
-
with open(file_path, "rb") as file:
|
|
160
|
-
return file.read().decode("utf-8", errors="ignore")
|
|
161
|
-
else:
|
|
162
|
-
with open(file_path, "rb") as file:
|
|
163
|
-
return file.read().decode("utf-8", errors="ignore")
|
|
@@ -1,102 +0,0 @@
|
|
|
1
|
-
from pathlib import Path
|
|
2
|
-
from .base import ModuleTestBase
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
class TestUnstructured(ModuleTestBase):
|
|
6
|
-
targets = ["http://127.0.0.1:8888"]
|
|
7
|
-
modules_overrides = ["unstructured", "filedownload", "httpx", "excavate", "speculate"]
|
|
8
|
-
config_overrides = {"web": {"spider_distance": 2, "spider_depth": 2}}
|
|
9
|
-
|
|
10
|
-
pdf_data = r"""%PDF-1.3
|
|
11
|
-
%���� ReportLab Generated PDF document http://www.reportlab.com
|
|
12
|
-
1 0 obj
|
|
13
|
-
<<
|
|
14
|
-
/F1 2 0 R
|
|
15
|
-
>>
|
|
16
|
-
endobj
|
|
17
|
-
2 0 obj
|
|
18
|
-
<<
|
|
19
|
-
/BaseFont /Helvetica /Encoding /WinAnsiEncoding /Name /F1 /Subtype /Type1 /Type /Font
|
|
20
|
-
>>
|
|
21
|
-
endobj
|
|
22
|
-
3 0 obj
|
|
23
|
-
<<
|
|
24
|
-
/Contents 7 0 R /MediaBox [ 0 0 595.2756 841.8898 ] /Parent 6 0 R /Resources <<
|
|
25
|
-
/Font 1 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ]
|
|
26
|
-
>> /Rotate 0 /Trans <<
|
|
27
|
-
|
|
28
|
-
>>
|
|
29
|
-
/Type /Page
|
|
30
|
-
>>
|
|
31
|
-
endobj
|
|
32
|
-
4 0 obj
|
|
33
|
-
<<
|
|
34
|
-
/PageMode /UseNone /Pages 6 0 R /Type /Catalog
|
|
35
|
-
>>
|
|
36
|
-
endobj
|
|
37
|
-
5 0 obj
|
|
38
|
-
<<
|
|
39
|
-
/Author (anonymous) /CreationDate (D:20240603185816+00'00') /Creator (ReportLab PDF Library - www.reportlab.com) /Keywords () /ModDate (D:20240603185816+00'00') /Producer (ReportLab PDF Library - www.reportlab.com)
|
|
40
|
-
/Subject (unspecified) /Title (untitled) /Trapped /False
|
|
41
|
-
>>
|
|
42
|
-
endobj
|
|
43
|
-
6 0 obj
|
|
44
|
-
<<
|
|
45
|
-
/Count 1 /Kids [ 3 0 R ] /Type /Pages
|
|
46
|
-
>>
|
|
47
|
-
endobj
|
|
48
|
-
7 0 obj
|
|
49
|
-
<<
|
|
50
|
-
/Filter [ /ASCII85Decode /FlateDecode ] /Length 107
|
|
51
|
-
>>
|
|
52
|
-
stream
|
|
53
|
-
GapQh0E=F,0U\H3T\pNYT^QKk?tc>IP,;W#U1^23ihPEM_?CW4KISi90MjG^2,FS#<RC5+c,n)Z;$bK$b"5I[<!^TD#gi]&=5X,[5@Y@V~>endstream
|
|
54
|
-
endobj
|
|
55
|
-
xref
|
|
56
|
-
0 8
|
|
57
|
-
0000000000 65535 f
|
|
58
|
-
0000000073 00000 n
|
|
59
|
-
0000000104 00000 n
|
|
60
|
-
0000000211 00000 n
|
|
61
|
-
0000000414 00000 n
|
|
62
|
-
0000000482 00000 n
|
|
63
|
-
0000000778 00000 n
|
|
64
|
-
0000000837 00000 n
|
|
65
|
-
trailer
|
|
66
|
-
<<
|
|
67
|
-
/ID
|
|
68
|
-
[<80d9f5b964fc99284501deb7a6a637f7><80d9f5b964fc99284501deb7a6a637f7>]
|
|
69
|
-
% ReportLab generated PDF document -- digest (http://www.reportlab.com)
|
|
70
|
-
|
|
71
|
-
/Info 5 0 R
|
|
72
|
-
/Root 4 0 R
|
|
73
|
-
/Size 8
|
|
74
|
-
>>
|
|
75
|
-
startxref
|
|
76
|
-
1034
|
|
77
|
-
%%EOF"""
|
|
78
|
-
|
|
79
|
-
unstructured_response = "Hello, World!"
|
|
80
|
-
|
|
81
|
-
async def setup_after_prep(self, module_test):
|
|
82
|
-
module_test.set_expect_requests(
|
|
83
|
-
dict(uri="/"),
|
|
84
|
-
dict(response_data='<a href="/Test_PDF"/>'),
|
|
85
|
-
)
|
|
86
|
-
module_test.set_expect_requests(
|
|
87
|
-
dict(uri="/Test_PDF"),
|
|
88
|
-
dict(response_data=self.pdf_data, headers={"Content-Type": "application/pdf"}),
|
|
89
|
-
)
|
|
90
|
-
|
|
91
|
-
def check(self, module_test, events):
|
|
92
|
-
filesystem_events = [e for e in events if e.type == "FILESYSTEM"]
|
|
93
|
-
assert 1 == len(filesystem_events), filesystem_events
|
|
94
|
-
filesystem_event = filesystem_events[0]
|
|
95
|
-
file = Path(filesystem_event.data["path"])
|
|
96
|
-
assert file.is_file(), "Destination file doesn't exist"
|
|
97
|
-
assert open(file).read() == self.pdf_data, f"File at {file} does not contain the correct content"
|
|
98
|
-
raw_text_events = [e for e in events if e.type == "RAW_TEXT"]
|
|
99
|
-
assert 1 == len(raw_text_events), "Failed to emit RAW_TEXT event"
|
|
100
|
-
assert (
|
|
101
|
-
raw_text_events[0].data == self.unstructured_response
|
|
102
|
-
), f"Text extracted from PDF is incorrect, got {raw_text_events[0].data}"
|