classifyre-cli 0.4.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101) hide show
  1. classifyre_cli-0.4.2.dist-info/METADATA +167 -0
  2. classifyre_cli-0.4.2.dist-info/RECORD +101 -0
  3. classifyre_cli-0.4.2.dist-info/WHEEL +4 -0
  4. classifyre_cli-0.4.2.dist-info/entry_points.txt +2 -0
  5. src/__init__.py +1 -0
  6. src/detectors/__init__.py +105 -0
  7. src/detectors/base.py +97 -0
  8. src/detectors/broken_links/__init__.py +3 -0
  9. src/detectors/broken_links/detector.py +280 -0
  10. src/detectors/config.py +59 -0
  11. src/detectors/content/__init__.py +0 -0
  12. src/detectors/custom/__init__.py +13 -0
  13. src/detectors/custom/detector.py +45 -0
  14. src/detectors/custom/runners/__init__.py +56 -0
  15. src/detectors/custom/runners/_base.py +177 -0
  16. src/detectors/custom/runners/_factory.py +51 -0
  17. src/detectors/custom/runners/_feature_extraction.py +138 -0
  18. src/detectors/custom/runners/_gliner2.py +324 -0
  19. src/detectors/custom/runners/_image_classification.py +98 -0
  20. src/detectors/custom/runners/_llm.py +22 -0
  21. src/detectors/custom/runners/_object_detection.py +107 -0
  22. src/detectors/custom/runners/_regex.py +147 -0
  23. src/detectors/custom/runners/_text_classification.py +109 -0
  24. src/detectors/custom/trainer.py +293 -0
  25. src/detectors/dependencies.py +109 -0
  26. src/detectors/pii/__init__.py +0 -0
  27. src/detectors/pii/detector.py +883 -0
  28. src/detectors/secrets/__init__.py +0 -0
  29. src/detectors/secrets/detector.py +399 -0
  30. src/detectors/threat/__init__.py +0 -0
  31. src/detectors/threat/code_security_detector.py +206 -0
  32. src/detectors/threat/yara_detector.py +177 -0
  33. src/main.py +608 -0
  34. src/models/generated_detectors.py +1296 -0
  35. src/models/generated_input.py +2732 -0
  36. src/models/generated_single_asset_scan_results.py +240 -0
  37. src/outputs/__init__.py +3 -0
  38. src/outputs/base.py +69 -0
  39. src/outputs/console.py +62 -0
  40. src/outputs/factory.py +156 -0
  41. src/outputs/file.py +83 -0
  42. src/outputs/rest.py +258 -0
  43. src/pipeline/__init__.py +7 -0
  44. src/pipeline/content_provider.py +26 -0
  45. src/pipeline/detector_pipeline.py +742 -0
  46. src/pipeline/parsed_content_provider.py +59 -0
  47. src/sandbox/__init__.py +5 -0
  48. src/sandbox/runner.py +145 -0
  49. src/sources/__init__.py +95 -0
  50. src/sources/atlassian_common.py +389 -0
  51. src/sources/azure_blob_storage/__init__.py +3 -0
  52. src/sources/azure_blob_storage/source.py +130 -0
  53. src/sources/base.py +296 -0
  54. src/sources/confluence/__init__.py +3 -0
  55. src/sources/confluence/source.py +733 -0
  56. src/sources/databricks/__init__.py +3 -0
  57. src/sources/databricks/source.py +1279 -0
  58. src/sources/dependencies.py +81 -0
  59. src/sources/google_cloud_storage/__init__.py +3 -0
  60. src/sources/google_cloud_storage/source.py +114 -0
  61. src/sources/hive/__init__.py +3 -0
  62. src/sources/hive/source.py +709 -0
  63. src/sources/jira/__init__.py +3 -0
  64. src/sources/jira/source.py +605 -0
  65. src/sources/mongodb/__init__.py +3 -0
  66. src/sources/mongodb/source.py +550 -0
  67. src/sources/mssql/__init__.py +3 -0
  68. src/sources/mssql/source.py +1034 -0
  69. src/sources/mysql/__init__.py +3 -0
  70. src/sources/mysql/source.py +797 -0
  71. src/sources/neo4j/__init__.py +0 -0
  72. src/sources/neo4j/source.py +523 -0
  73. src/sources/object_storage/base.py +679 -0
  74. src/sources/oracle/__init__.py +3 -0
  75. src/sources/oracle/source.py +982 -0
  76. src/sources/postgresql/__init__.py +3 -0
  77. src/sources/postgresql/source.py +774 -0
  78. src/sources/powerbi/__init__.py +3 -0
  79. src/sources/powerbi/source.py +774 -0
  80. src/sources/recipe_normalizer.py +179 -0
  81. src/sources/s3_compatible_storage/README.md +66 -0
  82. src/sources/s3_compatible_storage/__init__.py +3 -0
  83. src/sources/s3_compatible_storage/source.py +150 -0
  84. src/sources/servicedesk/__init__.py +3 -0
  85. src/sources/servicedesk/source.py +620 -0
  86. src/sources/slack/__init__.py +3 -0
  87. src/sources/slack/source.py +534 -0
  88. src/sources/snowflake/__init__.py +3 -0
  89. src/sources/snowflake/source.py +912 -0
  90. src/sources/tableau/__init__.py +3 -0
  91. src/sources/tableau/source.py +799 -0
  92. src/sources/tabular_utils.py +165 -0
  93. src/sources/wordpress/__init__.py +3 -0
  94. src/sources/wordpress/source.py +590 -0
  95. src/telemetry.py +96 -0
  96. src/utils/__init__.py +1 -0
  97. src/utils/content_extraction.py +108 -0
  98. src/utils/file_parser.py +777 -0
  99. src/utils/hashing.py +82 -0
  100. src/utils/uv_sync.py +79 -0
  101. src/utils/validation.py +56 -0
@@ -0,0 +1,108 @@
1
+ """HTML to text content extraction utilities for detector scanning."""
2
+
3
+ import logging
4
+
5
+ from bs4 import BeautifulSoup
6
+
7
+ logger = logging.getLogger(__name__)
8
+
9
+
10
def html_to_text(html: str, preserve_structure: bool = True) -> str:
    """
    Extract plain text from HTML for detector scanning.

    Script, style and noscript elements are dropped before the text is
    collected. Text fragments are joined with a single space, so adjacent
    inline elements are separated by a space in the output. When
    ``preserve_structure`` is true, a newline is emitted for each
    block-level element so that paragraphs, headings, list items and table
    rows end up on separate lines.

    Args:
        html: HTML content to extract text from
        preserve_structure: Keep newlines for document structure (default: True)

    Returns:
        Clean plain text extracted from HTML. An empty string is returned
        for empty or whitespace-only input. If parsing fails, the error is
        logged and the original ``html`` is returned unchanged.

    Examples:
        >>> html_to_text("<p>Hello <b>world</b></p>")
        'Hello world'

        >>> html_to_text("<h1>Title</h1><p>Text</p>")
        'Title\\nText'
    """
    if not html or not html.strip():
        return ""

    try:
        import re

        from bs4 import FeatureNotFound

        try:
            # Prefer lxml for speed.
            soup = BeautifulSoup(html, "lxml")
        except FeatureNotFound:
            # lxml is not installed: fall back to the stdlib parser instead
            # of giving up and returning raw markup for every document.
            soup = BeautifulSoup(html, "html.parser")

        # Remove script, style, noscript elements
        for element in soup(["script", "style", "noscript"]):
            element.decompose()

        if preserve_structure:
            block_elements = [
                "p",
                "div",
                "h1",
                "h2",
                "h3",
                "h4",
                "h5",
                "h6",
                "li",
                "tr",
                "br",
                "hr",
            ]
            for tag in soup.find_all(block_elements):
                # Append a newline as the last child of each block element
                # so get_text() emits a line break at the element's end.
                tag.append("\n")

        # Join text fragments with a space so neighbouring nodes don't fuse
        text = soup.get_text(separator=" ")

        if preserve_structure:
            # Collapse runs of spaces/tabs but keep the newlines added above
            text = re.sub(r"[ \t]+", " ", text)
            # Trim each line, then drop the lines that end up empty
            lines = [line.strip() for line in text.split("\n")]
            text = "\n".join(line for line in lines if line)
        else:
            # For non-structure mode, just collapse all whitespace
            text = re.sub(r"\s+", " ", text).strip()

        return text

    except Exception as e:
        # Deliberate best-effort: log and hand back the raw HTML so that
        # detectors still have something to scan.
        logger.error("Failed to parse HTML: %s", e)
        return html
85
+
86
+
87
def strip_html_tags(html: str) -> str:
    """
    Simple tag removal without parsing (faster but less accurate).

    This is a lightweight alternative to html_to_text that uses regex
    to strip tags. It doesn't handle entities, nested structures, or
    script/style removal. Use html_to_text for better results.

    Args:
        html: HTML content to strip tags from

    Returns:
        Text with HTML tags removed. Empty or ``None`` input yields an
        empty string.

    Examples:
        >>> strip_html_tags("<p>Hello <b>world</b></p>")
        'Hello world'
    """
    import re

    if not html:
        return ""

    # "[^>]*" instead of ".*?" so that a tag whose attributes wrap onto
    # another line is still removed ("." does not match a newline).
    return re.sub(r"<[^>]*>", "", html)