aiwaf 0.1.9.2.1__tar.gz → 0.1.9.2.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of aiwaf might be problematic. Click here for more details.

Files changed (42) hide show
  1. {aiwaf-0.1.9.2.1 → aiwaf-0.1.9.2.3}/PKG-INFO +1 -1
  2. {aiwaf-0.1.9.2.1 → aiwaf-0.1.9.2.3}/aiwaf/__init__.py +1 -1
  3. aiwaf-0.1.9.2.3/aiwaf/management/commands/aiwaf_list.py +81 -0
  4. {aiwaf-0.1.9.2.1 → aiwaf-0.1.9.2.3}/aiwaf/middleware.py +64 -6
  5. {aiwaf-0.1.9.2.1 → aiwaf-0.1.9.2.3}/aiwaf/trainer.py +169 -34
  6. {aiwaf-0.1.9.2.1 → aiwaf-0.1.9.2.3}/aiwaf.egg-info/PKG-INFO +1 -1
  7. {aiwaf-0.1.9.2.1 → aiwaf-0.1.9.2.3}/aiwaf.egg-info/SOURCES.txt +1 -0
  8. {aiwaf-0.1.9.2.1 → aiwaf-0.1.9.2.3}/pyproject.toml +1 -1
  9. {aiwaf-0.1.9.2.1 → aiwaf-0.1.9.2.3}/setup.py +1 -1
  10. {aiwaf-0.1.9.2.1 → aiwaf-0.1.9.2.3}/LICENSE +0 -0
  11. {aiwaf-0.1.9.2.1 → aiwaf-0.1.9.2.3}/README.md +0 -0
  12. {aiwaf-0.1.9.2.1 → aiwaf-0.1.9.2.3}/aiwaf/apps.py +0 -0
  13. {aiwaf-0.1.9.2.1 → aiwaf-0.1.9.2.3}/aiwaf/blacklist_manager.py +0 -0
  14. {aiwaf-0.1.9.2.1 → aiwaf-0.1.9.2.3}/aiwaf/decorators.py +0 -0
  15. {aiwaf-0.1.9.2.1 → aiwaf-0.1.9.2.3}/aiwaf/management/__init__.py +0 -0
  16. {aiwaf-0.1.9.2.1 → aiwaf-0.1.9.2.3}/aiwaf/management/commands/__init__.py +0 -0
  17. {aiwaf-0.1.9.2.1 → aiwaf-0.1.9.2.3}/aiwaf/management/commands/add_exemption.py +0 -0
  18. {aiwaf-0.1.9.2.1 → aiwaf-0.1.9.2.3}/aiwaf/management/commands/add_ipexemption.py +0 -0
  19. {aiwaf-0.1.9.2.1 → aiwaf-0.1.9.2.3}/aiwaf/management/commands/aiwaf_diagnose.py +0 -0
  20. {aiwaf-0.1.9.2.1 → aiwaf-0.1.9.2.3}/aiwaf/management/commands/aiwaf_logging.py +0 -0
  21. {aiwaf-0.1.9.2.1 → aiwaf-0.1.9.2.3}/aiwaf/management/commands/aiwaf_reset.py +0 -0
  22. {aiwaf-0.1.9.2.1 → aiwaf-0.1.9.2.3}/aiwaf/management/commands/check_dependencies.py +0 -0
  23. {aiwaf-0.1.9.2.1 → aiwaf-0.1.9.2.3}/aiwaf/management/commands/clear_blacklist.py +0 -0
  24. {aiwaf-0.1.9.2.1 → aiwaf-0.1.9.2.3}/aiwaf/management/commands/clear_cache.py +0 -0
  25. {aiwaf-0.1.9.2.1 → aiwaf-0.1.9.2.3}/aiwaf/management/commands/debug_csv.py +0 -0
  26. {aiwaf-0.1.9.2.1 → aiwaf-0.1.9.2.3}/aiwaf/management/commands/detect_and_train.py +0 -0
  27. {aiwaf-0.1.9.2.1 → aiwaf-0.1.9.2.3}/aiwaf/management/commands/diagnose_blocking.py +0 -0
  28. {aiwaf-0.1.9.2.1 → aiwaf-0.1.9.2.3}/aiwaf/management/commands/regenerate_model.py +0 -0
  29. {aiwaf-0.1.9.2.1 → aiwaf-0.1.9.2.3}/aiwaf/management/commands/setup_models.py +0 -0
  30. {aiwaf-0.1.9.2.1 → aiwaf-0.1.9.2.3}/aiwaf/management/commands/test_exemption.py +0 -0
  31. {aiwaf-0.1.9.2.1 → aiwaf-0.1.9.2.3}/aiwaf/management/commands/test_exemption_fix.py +0 -0
  32. {aiwaf-0.1.9.2.1 → aiwaf-0.1.9.2.3}/aiwaf/middleware_logger.py +0 -0
  33. {aiwaf-0.1.9.2.1 → aiwaf-0.1.9.2.3}/aiwaf/models.py +0 -0
  34. {aiwaf-0.1.9.2.1 → aiwaf-0.1.9.2.3}/aiwaf/resources/model.pkl +0 -0
  35. {aiwaf-0.1.9.2.1 → aiwaf-0.1.9.2.3}/aiwaf/storage.py +0 -0
  36. {aiwaf-0.1.9.2.1 → aiwaf-0.1.9.2.3}/aiwaf/templatetags/__init__.py +0 -0
  37. {aiwaf-0.1.9.2.1 → aiwaf-0.1.9.2.3}/aiwaf/templatetags/aiwaf_tags.py +0 -0
  38. {aiwaf-0.1.9.2.1 → aiwaf-0.1.9.2.3}/aiwaf/utils.py +0 -0
  39. {aiwaf-0.1.9.2.1 → aiwaf-0.1.9.2.3}/aiwaf.egg-info/dependency_links.txt +0 -0
  40. {aiwaf-0.1.9.2.1 → aiwaf-0.1.9.2.3}/aiwaf.egg-info/requires.txt +0 -0
  41. {aiwaf-0.1.9.2.1 → aiwaf-0.1.9.2.3}/aiwaf.egg-info/top_level.txt +0 -0
  42. {aiwaf-0.1.9.2.1 → aiwaf-0.1.9.2.3}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: aiwaf
3
- Version: 0.1.9.2.1
3
+ Version: 0.1.9.2.3
4
4
  Summary: AI-powered Web Application Firewall
5
5
  Home-page: https://github.com/aayushgauba/aiwaf
6
6
  Author: Aayush Gauba
@@ -1,6 +1,6 @@
1
1
  default_app_config = "aiwaf.apps.AiwafConfig"
2
2
 
3
- __version__ = "0.1.9.2.1"
3
+ __version__ = "0.1.9.2.3"
4
4
 
5
5
  # Note: Middleware classes are available from aiwaf.middleware
6
6
  # Import them only when needed to avoid circular imports during Django app loading
@@ -0,0 +1,81 @@
1
+ from django.core.management.base import BaseCommand
2
+ from django.utils import timezone
3
+ from aiwaf.storage import get_blacklist_store, get_exemption_store, get_keyword_store
4
+ from datetime import timedelta
5
+ import json
6
+
7
+ def _sort(items, order):
8
+ reverse = (order == "newest")
9
+ return sorted(items, key=lambda x: x.get("created_at") or timezone.make_aware(timezone.datetime.min),
10
+ reverse=reverse)
11
+
12
+ def _filter_since(items, seconds):
13
+ if not seconds: return items
14
+ cutoff = timezone.now() - timedelta(seconds=seconds)
15
+ return [it for it in items if it.get("created_at") and it["created_at"] >= cutoff]
16
+
17
+ def _print_table(rows, headers):
18
+ widths = [len(h) for h in headers]
19
+ for r in rows:
20
+ for i, cell in enumerate(r):
21
+ widths[i] = max(widths[i], len(str(cell)))
22
+ print(" | ".join(h.ljust(widths[i]) for i, h in enumerate(headers)))
23
+ print("-+-".join("-" * w for w in widths))
24
+ for r in rows:
25
+ print(" | ".join(str(cell).ljust(widths[i]) for i, cell in enumerate(r)))
26
+
27
class Command(BaseCommand):
    """Management command listing AIWAF data.

    It can show blocked IPs, exempted IPs and dynamic keywords. One selector
    flag (``--ips``, ``--exemptions``, ``--keywords`` or ``--all``) chooses the
    sections; blocked IPs are shown when no selector is given. Output is either
    a plain-text table or JSON.
    """

    help = "Lister les données AIWAF (IPs bloquées, exemptions, mots-clés dynamiques)."

    def add_arguments(self, parser):
        """Register the mutually exclusive selectors and the output options."""
        selector = parser.add_mutually_exclusive_group()
        for flag, text in (
            ("--ips", "Lister les IPs bloquées (défaut)."),
            ("--exemptions", "Lister les IPs exemptées."),
            ("--keywords", "Lister les mots-clés dynamiques."),
            ("--all", "Tout lister."),
        ):
            selector.add_argument(flag, action="store_true", help=text)
        parser.add_argument("--format", choices=["table", "json"], default="table")
        parser.add_argument("--limit", type=int, default=100)
        parser.add_argument("--order", choices=["newest", "oldest"], default="newest")
        parser.add_argument("--since", type=int, help="Fenêtre en secondes (ex: 86400 = 24h).")

    @staticmethod
    def _json_default(value):
        """Serialize values ``json`` cannot handle: ISO format if available, else ``str``."""
        try:
            return value.isoformat()
        except Exception:
            return str(value)

    def handle(self, *args, **options):
        """Collect the requested sections and emit them in the chosen format."""
        show_all = options["all"]
        # Blocked IPs are the default section when no other selector was given.
        show_ips = options["ips"] or not (
            options["exemptions"] or options["keywords"] or show_all
        )
        limit = options["limit"]
        payload = {}

        # Sections whose records carry a timestamp share one pipeline:
        # load -> optional --since filter -> sort -> truncate to --limit.
        timestamped = (
            ("ips", show_ips, get_blacklist_store),
            ("exemptions", options["exemptions"], get_exemption_store),
        )
        for section, wanted, store_factory in timestamped:
            if not (show_all or wanted):
                continue
            records = _filter_since(store_factory().get_all(), options.get("since"))
            payload[section] = _sort(records, options["order"])[:limit]

        if show_all or options["keywords"]:
            payload["keywords"] = [
                {"keyword": word}
                for word in get_keyword_store().get_top_keywords(limit)
            ]

        if options["format"] == "json":
            self.stdout.write(
                json.dumps(payload, ensure_ascii=False, indent=2, default=self._json_default)
            )
            return

        record_columns = ["ip_address", "reason", "created_at"]
        layout = (
            ("ips", "\n== IPs bloquées ==", record_columns),
            ("exemptions", "\n== Exemptions ==", record_columns),
            ("keywords", "\n== Mots-clés dynamiques ==", ["keyword"]),
        )
        for section, title, columns in layout:
            if section not in payload:
                continue
            print(title)
            table_rows = [
                [record.get(column, "") for column in columns]
                for record in payload[section]
            ]
            _print_table(table_rows, columns)
@@ -298,11 +298,15 @@ class IPAndKeywordBlockMiddleware:
298
298
  keyword_store = get_keyword_store()
299
299
  segments = [seg for seg in re.split(r"\W+", path) if len(seg) > 3]
300
300
 
301
- # Only learn keywords from non-existent paths or suspicious contexts
302
- for seg in segments:
303
- if not path_exists or self._is_malicious_context(request, seg):
304
- keyword_store.add_keyword(seg)
305
-
301
+ # Smart learning: only learn from suspicious contexts, never from valid paths
302
+ if not path_exists: # Only learn from non-existent paths
303
+ for seg in segments:
304
+ # Only learn if it's not a legitimate keyword AND in a suspicious context
305
+ if (seg not in self.legitimate_path_keywords and
306
+ seg not in self.exempt_keywords and
307
+ self._is_malicious_context(request, seg)):
308
+ keyword_store.add_keyword(seg)
309
+
306
310
  dynamic_top = keyword_store.get_top_keywords(getattr(settings, "AIWAF_DYNAMIC_TOP_N", 10))
307
311
  all_kw = set(STATIC_KW) | set(dynamic_top)
308
312
 
@@ -345,7 +349,29 @@ class IPAndKeywordBlockMiddleware:
345
349
  block_reason = f"Inherently suspicious: {seg}"
346
350
 
347
351
  if is_suspicious:
348
- # Additional context check before blocking
352
+ # Additional context check before blocking - be more conservative with valid paths
353
+ if path_exists:
354
+ # For valid paths, only block if there are VERY strong malicious indicators
355
+ very_strong_indicators = [
356
+ # Multiple attack patterns in same request
357
+ sum([
358
+ '../' in request.path, '..\\' in request.path,
359
+ any(param in request.GET for param in ['cmd', 'exec', 'system']),
360
+ request.path.count('%') > 5, # Heavy URL encoding
361
+ len([s for s in segments if s in self.malicious_keywords]) > 2
362
+ ]) >= 2,
363
+
364
+ # Obvious attack attempts on valid paths
365
+ any(attack in request.path.lower() for attack in [
366
+ 'union+select', 'drop+table', '<script', 'javascript:',
367
+ 'onload=', 'onerror=', '${', '{{', 'eval('
368
+ ])
369
+ ]
370
+
371
+ if not any(very_strong_indicators):
372
+ continue # Skip blocking for valid paths without very strong indicators
373
+
374
+ # For non-existent paths or paths with very strong indicators, proceed with blocking
349
375
  if self._is_malicious_context(request, seg) or not path_exists:
350
376
  # Double-check exemption before blocking
351
377
  if not exemption_store.is_exempted(ip):
@@ -405,6 +431,38 @@ class AIAnomalyMiddleware(MiddlewareMixin):
405
431
  # Use the safely loaded global MODEL instead of loading again
406
432
  self.model = MODEL
407
433
 
434
def _is_malicious_context(self, request, keyword):
    """Report whether the request shows strong indicators of malicious intent.

    Used to decide whether a keyword should be learned: paths that resolve to
    a real Django route are never considered malicious here.

    Args:
        request: Object exposing ``path`` (str) and ``GET`` (supports ``in``).
        keyword: The path segment under evaluation. Not consulted by the
            checks below; kept so the call signature stays unchanged.

    Returns:
        True if at least one indicator matches, otherwise False.
    """
    path = request.path

    # Don't learn from valid Django paths
    if path_exists_in_django(path):
        return False

    lowered = path.lower()

    malicious_indicators = [
        # More than one path segment is a known malicious keyword
        len([seg for seg in re.split(r"\W+", path) if seg in self.malicious_keywords]) > 1,

        # Common attack patterns
        any(pattern in lowered for pattern in (
            '../', '..\\', '.env', 'wp-admin', 'phpmyadmin', 'config',
            'backup', 'database', 'mysql', 'passwd', 'shadow',
        )),

        # Suspicious query parameters
        any(param in request.GET for param in ('cmd', 'exec', 'system', 'shell')),

        # Multiple directory traversal attempts (already implied by the
        # single '../' / '..\\' patterns above; kept as an explicit signal)
        path.count('../') > 2 or path.count('..\\') > 2,

        # Encoded attack patterns. Hex digits in percent-encoding are
        # case-insensitive, so match against the lower-cased path; otherwise
        # '%2E%2E' or '%C0%AE' would slip through.
        any(encoded in lowered for encoded in ('%2e%2e', '%252e', '%c0%ae')),
    ]

    return any(malicious_indicators)
465
+
408
466
  def process_request(self, request):
409
467
  # First exemption check - early exit for exempt requests
410
468
  if is_exempt(request):
@@ -34,19 +34,34 @@ def path_exists_in_django(path: str) -> bool:
34
34
  from django.urls import get_resolver
35
35
  from django.urls.resolvers import URLResolver
36
36
 
37
- candidate = path.split("?")[0].lstrip("/")
37
+ candidate = path.split("?")[0].strip("/") # Remove query params and normalize slashes
38
+
39
+ # Try exact resolution first - this is the most reliable method
38
40
  try:
39
41
  get_resolver().resolve(f"/{candidate}")
40
42
  return True
41
43
  except:
42
44
  pass
43
-
44
- root = get_resolver()
45
- for p in root.url_patterns:
46
- if isinstance(p, URLResolver):
47
- prefix = p.pattern.describe().strip("^/")
48
- if prefix and candidate.startswith(prefix):
49
- return True
45
+
46
+ # Also try with trailing slash if it doesn't have one
47
+ if not candidate.endswith("/"):
48
+ try:
49
+ get_resolver().resolve(f"/{candidate}/")
50
+ return True
51
+ except:
52
+ pass
53
+
54
+ # Try without trailing slash if it has one
55
+ if candidate.endswith("/"):
56
+ try:
57
+ get_resolver().resolve(f"/{candidate.rstrip('/')}")
58
+ return True
59
+ except:
60
+ pass
61
+
62
+ # If direct resolution fails, be conservative
63
+ # Only do basic prefix matching for known include patterns
64
+ # but don't assume sub-paths exist just because the prefix exists
50
65
  return False
51
66
 
52
67
 
@@ -81,7 +96,7 @@ def get_legitimate_keywords() -> set:
81
96
  """Get all legitimate keywords that shouldn't be learned as suspicious"""
82
97
  legitimate = set()
83
98
 
84
- # Common legitimate path segments
99
+ # Common legitimate path segments - expanded set
85
100
  default_legitimate = {
86
101
  "profile", "user", "users", "account", "accounts", "settings", "dashboard",
87
102
  "home", "about", "contact", "help", "search", "list", "lists",
@@ -91,7 +106,32 @@ def get_legitimate_keywords() -> set:
91
106
  "category", "categories", "tag", "tags", "post", "posts",
92
107
  "article", "articles", "blog", "blogs", "news", "item", "items",
93
108
  "admin", "administration", "manage", "manager", "control", "panel",
94
- "config", "configuration", "option", "options", "preference", "preferences"
109
+ "config", "configuration", "option", "options", "preference", "preferences",
110
+
111
+ # Django built-in app keywords
112
+ "contenttypes", "contenttype", "sessions", "session", "messages", "message",
113
+ "staticfiles", "static", "sites", "site", "flatpages", "flatpage",
114
+ "redirects", "redirect", "permissions", "permission", "groups", "group",
115
+
116
+ # Common third-party package keywords
117
+ "token", "tokens", "oauth", "social", "rest", "framework", "cors",
118
+ "debug", "toolbar", "extensions", "allauth", "crispy", "forms",
119
+ "channels", "celery", "redis", "cache", "email", "mail",
120
+
121
+ # Common API/web development terms
122
+ "endpoint", "endpoints", "resource", "resources", "data", "export",
123
+ "import", "upload", "download", "file", "files", "media", "images",
124
+ "documents", "reports", "analytics", "stats", "statistics",
125
+
126
+ # Common business/application terms
127
+ "customer", "customers", "client", "clients", "company", "companies",
128
+ "department", "departments", "employee", "employees", "team", "teams",
129
+ "project", "projects", "task", "tasks", "event", "events",
130
+ "notification", "notifications", "alert", "alerts",
131
+
132
+ # Language/localization
133
+ "language", "languages", "locale", "locales", "translation", "translations",
134
+ "en", "fr", "de", "es", "it", "pt", "ru", "ja", "zh", "ko"
95
135
  }
96
136
  legitimate.update(default_legitimate)
97
137
 
@@ -120,30 +160,41 @@ def _extract_django_route_keywords() -> set:
120
160
 
121
161
  # Extract from app names and labels
122
162
  for app_config in apps.get_app_configs():
123
- # Add app name and label
163
+ # Add app name and label - improved parsing
124
164
  if app_config.name:
125
- for segment in re.split(r'[._-]', app_config.name.lower()):
126
- if len(segment) > 2:
127
- keywords.add(segment)
165
+ app_parts = app_config.name.lower().replace('-', '_').split('.')
166
+ for part in app_parts:
167
+ for segment in re.split(r'[._-]', part):
168
+ if len(segment) > 2:
169
+ keywords.add(segment)
128
170
 
129
171
  if app_config.label and app_config.label != app_config.name:
130
172
  for segment in re.split(r'[._-]', app_config.label.lower()):
131
173
  if len(segment) > 2:
132
174
  keywords.add(segment)
133
175
 
134
- # Extract from model names in the app
176
+ # Extract from model names in the app - improved handling
135
177
  try:
136
178
  for model in app_config.get_models():
137
179
  model_name = model._meta.model_name.lower()
138
180
  if len(model_name) > 2:
139
181
  keywords.add(model_name)
140
- # Add plural form
141
- if not model_name.endswith('s'):
142
- keywords.add(f"{model_name}s")
182
+ # Add plural form
183
+ if not model_name.endswith('s'):
184
+ keywords.add(f"{model_name}s")
185
+
186
+ # Also add verbose names if different
187
+ verbose_name = str(model._meta.verbose_name).lower()
188
+ verbose_name_plural = str(model._meta.verbose_name_plural).lower()
189
+
190
+ for name in [verbose_name, verbose_name_plural]:
191
+ for segment in re.split(r'[^a-zA-Z]+', name):
192
+ if len(segment) > 2 and segment != model_name:
193
+ keywords.add(segment)
143
194
  except Exception:
144
195
  continue
145
196
 
146
- # Extract from URL patterns
197
+ # Extract from URL patterns - improved extraction
147
198
  def extract_from_pattern(pattern, prefix=""):
148
199
  try:
149
200
  if isinstance(pattern, URLResolver):
@@ -154,26 +205,41 @@ def _extract_django_route_keywords() -> set:
154
205
  if len(segment) > 2:
155
206
  keywords.add(segment)
156
207
 
157
- # Extract from the pattern itself
208
+ # Extract from the pattern itself - more comprehensive
158
209
  pattern_str = str(pattern.pattern)
159
- for segment in re.findall(r'([a-zA-Z]\w{2,})', pattern_str):
160
- keywords.add(segment.lower())
210
+ # Get literal path segments (not regex parts)
211
+ literal_parts = re.findall(r'([a-zA-Z][a-zA-Z0-9_-]*)', pattern_str)
212
+ for part in literal_parts:
213
+ if len(part) > 2:
214
+ keywords.add(part.lower())
161
215
 
162
216
  # Recurse into nested patterns
163
- for nested_pattern in pattern.url_patterns:
164
- extract_from_pattern(nested_pattern, prefix)
217
+ try:
218
+ for nested_pattern in pattern.url_patterns:
219
+ extract_from_pattern(nested_pattern, prefix)
220
+ except:
221
+ pass
165
222
 
166
223
  elif isinstance(pattern, URLPattern):
167
- # Extract from URL pattern
224
+ # Extract from URL pattern - more comprehensive
168
225
  pattern_str = str(pattern.pattern)
169
- for segment in re.findall(r'([a-zA-Z]\w{2,})', pattern_str):
170
- keywords.add(segment.lower())
226
+ literal_parts = re.findall(r'([a-zA-Z][a-zA-Z0-9_-]*)', pattern_str)
227
+ for part in literal_parts:
228
+ if len(part) > 2:
229
+ keywords.add(part.lower())
171
230
 
172
231
  # Extract from view name if available
173
232
  if hasattr(pattern.callback, '__name__'):
174
233
  view_name = pattern.callback.__name__.lower()
175
234
  for segment in re.split(r'[._-]', view_name):
176
- if len(segment) > 2 and segment != 'view':
235
+ if len(segment) > 2 and segment not in ['view', 'class', 'function']:
236
+ keywords.add(segment)
237
+
238
+ # Extract from view class name if it's a class-based view
239
+ if hasattr(pattern.callback, 'view_class'):
240
+ class_name = pattern.callback.view_class.__name__.lower()
241
+ for segment in re.split(r'[._-]', class_name):
242
+ if len(segment) > 2 and segment not in ['view', 'class']:
177
243
  keywords.add(segment)
178
244
 
179
245
  except Exception:
@@ -188,10 +254,20 @@ def _extract_django_route_keywords() -> set:
188
254
  print(f"Warning: Could not extract Django route keywords: {e}")
189
255
 
190
256
  # Filter out very common/generic words that might be suspicious
257
+ # Expanded filter list
191
258
  filtered_keywords = set()
259
+ exclude_words = {
260
+ 'www', 'com', 'org', 'net', 'int', 'str', 'obj', 'get', 'set', 'put', 'del',
261
+ 'the', 'and', 'for', 'are', 'but', 'not', 'you', 'all', 'can', 'had', 'her',
262
+ 'was', 'one', 'our', 'out', 'day', 'had', 'has', 'his', 'how', 'man', 'new',
263
+ 'now', 'old', 'see', 'two', 'who', 'boy', 'did', 'its', 'let', 'put', 'say',
264
+ 'she', 'too', 'use', 'var', 'way', 'may', 'end', 'why', 'any', 'app', 'run'
265
+ }
266
+
192
267
  for keyword in keywords:
193
268
  if (len(keyword) >= 3 and
194
- keyword not in ['www', 'com', 'org', 'net', 'int', 'str', 'obj', 'get', 'set', 'put', 'del']):
269
+ keyword not in exclude_words and
270
+ not keyword.isdigit()):
195
271
  filtered_keywords.add(keyword)
196
272
 
197
273
  if filtered_keywords:
@@ -272,6 +348,50 @@ def _parse(line: str) -> dict | None:
272
348
  }
273
349
 
274
350
 
351
def _is_malicious_context_trainer(path: str, keyword: str, status: str = "404") -> bool:
    """Report whether a logged request path shows strong malicious indicators.

    Trainer-side counterpart of the middleware's ``_is_malicious_context``.
    Paths that resolve to a real Django route are never considered malicious.

    Args:
        path: Request path taken from a parsed log line.
        keyword: The path segment under evaluation. Not consulted by the
            checks below; kept so the call signature stays unchanged.
        status: HTTP status of the logged request, as a string.

    Returns:
        True if at least one indicator matches, otherwise False.
    """
    # Don't learn from valid Django paths
    if path_exists_in_django(path):
        return False

    lowered = path.lower()

    malicious_indicators = [
        # More than one path segment is a static keyword. Segments come from
        # the lower-cased path, matching how train() compares segments
        # against STATIC_KW.
        len([seg for seg in re.split(r"\W+", lowered) if seg in STATIC_KW]) > 1,

        # Common attack patterns
        any(pattern in lowered for pattern in (
            '../', '..\\', '.env', 'wp-admin', 'phpmyadmin', 'config',
            'backup', 'database', 'mysql', 'passwd', 'shadow', 'xmlrpc',
            'shell', 'cmd', 'exec', 'eval', 'system',
        )),

        # Path indicates obvious attack attempt
        any(attack in lowered for attack in (
            'union+select', 'drop+table', '<script', 'javascript:',
            '${', '{{', 'onload=', 'onerror=', 'file://', 'http://',
        )),

        # Multiple directory traversal attempts (already implied by the
        # single '../' / '..\\' patterns above; kept as an explicit signal)
        path.count('../') > 1 or path.count('..\\') > 1,

        # Encoded attack patterns. Hex digits in percent-encoding are
        # case-insensitive, so match against the lower-cased path; otherwise
        # '%2E%2E' or '%3C%73...' would slip through.
        any(encoded in lowered for encoded in (
            '%2e%2e', '%252e', '%c0%ae', '%3c%73%63%72%69%70%74',
        )),

        # 404 status with suspicious characteristics
        status == "404" and (
            len(path) > 50  # Very long paths are often attacks
            or path.count('/') > 10  # Too many directory levels
            or any(c in path for c in ('<', '>', '{', '}', '$', '`'))  # Special characters
        ),
    ]

    return any(malicious_indicators)
393
+
394
+
275
395
  def train() -> None:
276
396
  """Enhanced training with improved keyword filtering and exemption handling"""
277
397
  print("🚀 Starting AIWAF enhanced training...")
@@ -436,28 +556,43 @@ def train() -> None:
436
556
  for seg in re.split(r"\W+", r["path"].lower()):
437
557
  if (len(seg) > 3 and
438
558
  seg not in STATIC_KW and
439
- seg not in legitimate_keywords): # Don't learn legitimate keywords
559
+ seg not in legitimate_keywords and # Don't learn legitimate keywords
560
+ _is_malicious_context_trainer(r["path"], seg, r["status"])): # Smart context check
440
561
  tokens[seg] += 1
441
562
 
442
563
  keyword_store = get_keyword_store()
443
564
  top_tokens = tokens.most_common(getattr(settings, "AIWAF_DYNAMIC_TOP_N", 10))
444
565
 
445
- # Additional filtering: only add keywords that appear suspicious enough
566
+ # Additional filtering: only add keywords that appear suspicious enough AND in malicious context
446
567
  filtered_tokens = []
568
+ learned_from_paths = [] # Track which paths we learned from
569
+
447
570
  for kw, cnt in top_tokens:
448
- # Don't add keywords that might be legitimate
571
+ # Find example paths where this keyword appeared
572
+ example_paths = [r["path"] for r in parsed
573
+ if kw in r["path"].lower() and
574
+ r["status"].startswith(("4", "5")) and
575
+ not path_exists_in_django(r["path"])]
576
+
577
+ # Only add if keyword appears in malicious contexts
449
578
  if (cnt >= 2 and # Must appear at least twice
450
579
  len(kw) >= 4 and # Must be at least 4 characters
451
- kw not in legitimate_keywords): # Not in legitimate set
580
+ kw not in legitimate_keywords and # Not in legitimate set
581
+ example_paths and # Has example paths
582
+ any(_is_malicious_context_trainer(path, kw) for path in example_paths[:3])): # Check first 3 paths
583
+
452
584
  filtered_tokens.append((kw, cnt))
453
585
  keyword_store.add_keyword(kw, cnt)
586
+ learned_from_paths.extend(example_paths[:2]) # Track first 2 example paths
454
587
 
455
588
  if filtered_tokens:
456
589
  print(f"📝 Added {len(filtered_tokens)} suspicious keywords: {[kw for kw, _ in filtered_tokens]}")
590
+ print(f"🎯 Example malicious paths learned from: {learned_from_paths[:5]}") # Show first 5
457
591
  else:
458
592
  print("✅ No new suspicious keywords learned (good sign!)")
459
593
 
460
- print(f"🎯 Dynamic keyword learning complete. Excluded {len(legitimate_keywords)} legitimate keywords.")
594
+ print(f"🎯 Smart keyword learning complete. Excluded {len(legitimate_keywords)} legitimate keywords.")
595
+ print(f"🔒 Used malicious context analysis to filter out false positives.")
461
596
 
462
597
  # Training summary
463
598
  print("\n" + "="*60)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: aiwaf
3
- Version: 0.1.9.2.1
3
+ Version: 0.1.9.2.3
4
4
  Summary: AI-powered Web Application Firewall
5
5
  Home-page: https://github.com/aayushgauba/aiwaf
6
6
  Author: Aayush Gauba
@@ -22,6 +22,7 @@ aiwaf/management/commands/__init__.py
22
22
  aiwaf/management/commands/add_exemption.py
23
23
  aiwaf/management/commands/add_ipexemption.py
24
24
  aiwaf/management/commands/aiwaf_diagnose.py
25
+ aiwaf/management/commands/aiwaf_list.py
25
26
  aiwaf/management/commands/aiwaf_logging.py
26
27
  aiwaf/management/commands/aiwaf_reset.py
27
28
  aiwaf/management/commands/check_dependencies.py
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "aiwaf"
3
- version = "0.1.9.2.1"
3
+ version = "0.1.9.2.3"
4
4
  description = "AI-powered Web Application Firewall"
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.8"
@@ -9,7 +9,7 @@ long_description = (HERE / "README.md").read_text(encoding="utf-8")
9
9
 
10
10
  setup(
11
11
  name="aiwaf",
12
- version="0.1.9.2.1",
12
+ version="0.1.9.2.3",
13
13
  description="AI‑driven, self‑learning Web Application Firewall for Django",
14
14
  long_description=long_description,
15
15
  long_description_content_type="text/markdown",
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes