classifyre-cli 0.4.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- classifyre_cli-0.4.2.dist-info/METADATA +167 -0
- classifyre_cli-0.4.2.dist-info/RECORD +101 -0
- classifyre_cli-0.4.2.dist-info/WHEEL +4 -0
- classifyre_cli-0.4.2.dist-info/entry_points.txt +2 -0
- src/__init__.py +1 -0
- src/detectors/__init__.py +105 -0
- src/detectors/base.py +97 -0
- src/detectors/broken_links/__init__.py +3 -0
- src/detectors/broken_links/detector.py +280 -0
- src/detectors/config.py +59 -0
- src/detectors/content/__init__.py +0 -0
- src/detectors/custom/__init__.py +13 -0
- src/detectors/custom/detector.py +45 -0
- src/detectors/custom/runners/__init__.py +56 -0
- src/detectors/custom/runners/_base.py +177 -0
- src/detectors/custom/runners/_factory.py +51 -0
- src/detectors/custom/runners/_feature_extraction.py +138 -0
- src/detectors/custom/runners/_gliner2.py +324 -0
- src/detectors/custom/runners/_image_classification.py +98 -0
- src/detectors/custom/runners/_llm.py +22 -0
- src/detectors/custom/runners/_object_detection.py +107 -0
- src/detectors/custom/runners/_regex.py +147 -0
- src/detectors/custom/runners/_text_classification.py +109 -0
- src/detectors/custom/trainer.py +293 -0
- src/detectors/dependencies.py +109 -0
- src/detectors/pii/__init__.py +0 -0
- src/detectors/pii/detector.py +883 -0
- src/detectors/secrets/__init__.py +0 -0
- src/detectors/secrets/detector.py +399 -0
- src/detectors/threat/__init__.py +0 -0
- src/detectors/threat/code_security_detector.py +206 -0
- src/detectors/threat/yara_detector.py +177 -0
- src/main.py +608 -0
- src/models/generated_detectors.py +1296 -0
- src/models/generated_input.py +2732 -0
- src/models/generated_single_asset_scan_results.py +240 -0
- src/outputs/__init__.py +3 -0
- src/outputs/base.py +69 -0
- src/outputs/console.py +62 -0
- src/outputs/factory.py +156 -0
- src/outputs/file.py +83 -0
- src/outputs/rest.py +258 -0
- src/pipeline/__init__.py +7 -0
- src/pipeline/content_provider.py +26 -0
- src/pipeline/detector_pipeline.py +742 -0
- src/pipeline/parsed_content_provider.py +59 -0
- src/sandbox/__init__.py +5 -0
- src/sandbox/runner.py +145 -0
- src/sources/__init__.py +95 -0
- src/sources/atlassian_common.py +389 -0
- src/sources/azure_blob_storage/__init__.py +3 -0
- src/sources/azure_blob_storage/source.py +130 -0
- src/sources/base.py +296 -0
- src/sources/confluence/__init__.py +3 -0
- src/sources/confluence/source.py +733 -0
- src/sources/databricks/__init__.py +3 -0
- src/sources/databricks/source.py +1279 -0
- src/sources/dependencies.py +81 -0
- src/sources/google_cloud_storage/__init__.py +3 -0
- src/sources/google_cloud_storage/source.py +114 -0
- src/sources/hive/__init__.py +3 -0
- src/sources/hive/source.py +709 -0
- src/sources/jira/__init__.py +3 -0
- src/sources/jira/source.py +605 -0
- src/sources/mongodb/__init__.py +3 -0
- src/sources/mongodb/source.py +550 -0
- src/sources/mssql/__init__.py +3 -0
- src/sources/mssql/source.py +1034 -0
- src/sources/mysql/__init__.py +3 -0
- src/sources/mysql/source.py +797 -0
- src/sources/neo4j/__init__.py +0 -0
- src/sources/neo4j/source.py +523 -0
- src/sources/object_storage/base.py +679 -0
- src/sources/oracle/__init__.py +3 -0
- src/sources/oracle/source.py +982 -0
- src/sources/postgresql/__init__.py +3 -0
- src/sources/postgresql/source.py +774 -0
- src/sources/powerbi/__init__.py +3 -0
- src/sources/powerbi/source.py +774 -0
- src/sources/recipe_normalizer.py +179 -0
- src/sources/s3_compatible_storage/README.md +66 -0
- src/sources/s3_compatible_storage/__init__.py +3 -0
- src/sources/s3_compatible_storage/source.py +150 -0
- src/sources/servicedesk/__init__.py +3 -0
- src/sources/servicedesk/source.py +620 -0
- src/sources/slack/__init__.py +3 -0
- src/sources/slack/source.py +534 -0
- src/sources/snowflake/__init__.py +3 -0
- src/sources/snowflake/source.py +912 -0
- src/sources/tableau/__init__.py +3 -0
- src/sources/tableau/source.py +799 -0
- src/sources/tabular_utils.py +165 -0
- src/sources/wordpress/__init__.py +3 -0
- src/sources/wordpress/source.py +590 -0
- src/telemetry.py +96 -0
- src/utils/__init__.py +1 -0
- src/utils/content_extraction.py +108 -0
- src/utils/file_parser.py +777 -0
- src/utils/hashing.py +82 -0
- src/utils/uv_sync.py +79 -0
- src/utils/validation.py +56 -0
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
"""HTML to text content extraction utilities for detector scanning."""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
|
|
5
|
+
from bs4 import BeautifulSoup
|
|
6
|
+
|
|
7
|
+
logger = logging.getLogger(__name__)
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def html_to_text(html: str, preserve_structure: bool = True) -> str:
    """
    Extract plain text from HTML for detector scanning.

    Removes script, style, and noscript elements while preserving the
    text content. Optionally preserves document structure with newlines.

    Args:
        html: HTML content to extract text from
        preserve_structure: Keep newlines for document structure (default: True)

    Returns:
        Clean plain text extracted from HTML. Empty or whitespace-only input
        yields an empty string. If parsing raises, the error is logged and
        the input is returned unchanged.

    Examples:
        >>> html_to_text("<p>Hello <b>world</b></p>")
        'Hello world'

        >>> html_to_text("<h1>Title</h1><p>Text</p>")
        'Title\\nText'
    """
    if not html or not html.strip():
        return ""

    import re

    try:
        # Parse HTML using lxml for speed
        soup = BeautifulSoup(html, "lxml")

        # Remove script, style, noscript elements: their text is not content
        for element in soup(["script", "style", "noscript"]):
            element.decompose()

        if preserve_structure:
            block_elements = [
                "p",
                "div",
                "h1",
                "h2",
                "h3",
                "h4",
                "h5",
                "h6",
                "li",
                "tr",
                "br",
                "hr",
            ]
            for tag in soup.find_all(block_elements):
                # Break the line on both sides of a block element; without
                # the leading newline, text that directly precedes a block
                # (e.g. "<div>Intro<p>Para</p></div>") would be merged onto
                # the block's first line.
                tag.insert_before("\n")
                tag.append("\n")

        # A space separator keeps text from adjacent inline elements apart;
        # the resulting runs of whitespace are collapsed below.
        text = soup.get_text(separator=" ")

        if preserve_structure:
            # Collapse every whitespace run except newlines. "[^\S\n]" also
            # covers non-breaking spaces (from &nbsp;) and carriage returns,
            # matching what "\s+" collapses in the non-structure branch.
            text = re.sub(r"[^\S\n]+", " ", text)
            # Trim each line and drop the ones that end up empty
            stripped_lines = (line.strip() for line in text.split("\n"))
            text = "\n".join(line for line in stripped_lines if line)
        else:
            # For non-structure mode, just collapse all whitespace
            text = re.sub(r"\s+", " ", text).strip()

        return text

    except Exception as e:
        # Deliberate best-effort boundary: a parse failure must not abort
        # the scan, so log it and hand back the raw HTML for the detectors.
        logger.error("Failed to parse HTML: %s", e)
        return html
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def strip_html_tags(html: str) -> str:
    """
    Simple tag removal without parsing (faster but less accurate).

    This is a lightweight alternative to html_to_text that uses regex
    to strip tags. It doesn't handle entities, nested structures, or
    script/style removal. Use html_to_text for better results.

    Args:
        html: HTML content to strip tags from

    Returns:
        Text with HTML tags removed. Empty (or None) input yields an
        empty string.

    Examples:
        >>> strip_html_tags("<p>Hello <b>world</b></p>")
        'Hello world'
    """
    import re

    if not html:
        return ""

    # "[^>]*" (unlike ".*?" without DOTALL) also matches newlines, so tags
    # whose attributes wrap across lines are removed as well.
    return re.sub(r"<[^>]*>", "", html)
|