superlocalmemory 2.8.2 → 2.8.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +7 -5
- package/api_server.py +5 -0
- package/bin/slm.bat +3 -3
- package/docs/SECURITY-QUICK-REFERENCE.md +214 -0
- package/install.ps1 +11 -11
- package/mcp_server.py +3 -3
- package/package.json +2 -2
- package/requirements-core.txt +16 -18
- package/requirements-learning.txt +8 -8
- package/requirements.txt +9 -7
- package/scripts/prepack.js +33 -0
- package/scripts/verify-v27.ps1 +301 -0
- package/src/agent_registry.py +32 -28
- package/src/auto_backup.py +12 -6
- package/src/cache_manager.py +2 -2
- package/src/compression/__init__.py +25 -0
- package/src/compression/cli.py +150 -0
- package/src/compression/cold_storage.py +217 -0
- package/src/compression/config.py +72 -0
- package/src/compression/orchestrator.py +133 -0
- package/src/compression/tier2_compressor.py +228 -0
- package/src/compression/tier3_compressor.py +153 -0
- package/src/compression/tier_classifier.py +148 -0
- package/src/db_connection_manager.py +5 -5
- package/src/event_bus.py +24 -22
- package/src/hnsw_index.py +3 -3
- package/src/learning/__init__.py +5 -4
- package/src/learning/adaptive_ranker.py +14 -265
- package/src/learning/bootstrap/__init__.py +69 -0
- package/src/learning/bootstrap/constants.py +93 -0
- package/src/learning/bootstrap/db_queries.py +316 -0
- package/src/learning/bootstrap/sampling.py +82 -0
- package/src/learning/bootstrap/text_utils.py +71 -0
- package/src/learning/cross_project_aggregator.py +58 -57
- package/src/learning/db/__init__.py +40 -0
- package/src/learning/db/constants.py +44 -0
- package/src/learning/db/schema.py +279 -0
- package/src/learning/learning_db.py +15 -234
- package/src/learning/ranking/__init__.py +33 -0
- package/src/learning/ranking/constants.py +84 -0
- package/src/learning/ranking/helpers.py +278 -0
- package/src/learning/source_quality_scorer.py +66 -65
- package/src/learning/synthetic_bootstrap.py +28 -310
- package/src/memory/__init__.py +36 -0
- package/src/memory/cli.py +205 -0
- package/src/memory/constants.py +39 -0
- package/src/memory/helpers.py +28 -0
- package/src/memory/schema.py +166 -0
- package/src/memory-profiles.py +94 -86
- package/src/memory-reset.py +187 -185
- package/src/memory_compression.py +2 -2
- package/src/memory_store_v2.py +34 -354
- package/src/migrate_v1_to_v2.py +11 -10
- package/src/patterns/analyzers.py +104 -100
- package/src/patterns/learner.py +17 -13
- package/src/patterns/scoring.py +25 -21
- package/src/patterns/store.py +40 -38
- package/src/patterns/terminology.py +53 -51
- package/src/provenance_tracker.py +2 -2
- package/src/qualixar_attribution.py +1 -1
- package/src/search/engine.py +16 -14
- package/src/search/index_loader.py +13 -11
- package/src/setup_validator.py +160 -158
- package/src/subscription_manager.py +20 -18
- package/src/tree/builder.py +66 -64
- package/src/tree/nodes.py +103 -97
- package/src/tree/queries.py +142 -137
- package/src/tree/schema.py +46 -42
- package/src/webhook_dispatcher.py +3 -3
- package/ui_server.py +7 -4
|
@@ -63,67 +63,17 @@ from .feature_extractor import FeatureExtractor, FEATURE_NAMES, NUM_FEATURES
|
|
|
63
63
|
|
|
64
64
|
logger = logging.getLogger("superlocalmemory.learning.adaptive_ranker")
|
|
65
65
|
|
|
66
|
-
#
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
'ml_model': 200, # 200+ feedback across 50+ unique queries -> ML
|
|
78
|
-
}
|
|
79
|
-
|
|
80
|
-
# Minimum unique queries required for ML phase (prevents overfitting
|
|
81
|
-
# to a small number of repeated queries)
|
|
82
|
-
MIN_UNIQUE_QUERIES_FOR_ML = 50
|
|
83
|
-
|
|
84
|
-
# Rule-based boost multipliers (Phase 1)
|
|
85
|
-
# These are conservative — they nudge the ranking without flipping order
|
|
86
|
-
_RULE_BOOST = {
|
|
87
|
-
'tech_match_strong': 1.3, # Memory matches 2+ preferred techs
|
|
88
|
-
'tech_match_weak': 1.1, # Memory matches 1 preferred tech
|
|
89
|
-
'project_match': 1.5, # Memory from current project
|
|
90
|
-
'project_unknown': 1.0, # No project context — no boost
|
|
91
|
-
'project_mismatch': 0.9, # Memory from different project
|
|
92
|
-
'source_quality_high': 1.2, # Source quality > 0.7
|
|
93
|
-
'source_quality_low': 0.85, # Source quality < 0.3
|
|
94
|
-
'recency_boost_max': 1.2, # Recent memory (< 7 days)
|
|
95
|
-
'recency_penalty_max': 0.8, # Old memory (> 365 days)
|
|
96
|
-
'high_importance': 1.15, # Importance >= 8
|
|
97
|
-
'high_access': 1.1, # Accessed 5+ times
|
|
98
|
-
# v2.8: Lifecycle + behavioral boosts
|
|
99
|
-
'lifecycle_active': 1.0,
|
|
100
|
-
'lifecycle_warm': 0.85,
|
|
101
|
-
'lifecycle_cold': 0.6,
|
|
102
|
-
'outcome_success_high': 1.3,
|
|
103
|
-
'outcome_failure_high': 0.7,
|
|
104
|
-
'behavioral_match_strong': 1.25,
|
|
105
|
-
'cross_project_boost': 1.15,
|
|
106
|
-
'high_trust_creator': 1.1,
|
|
107
|
-
'low_trust_creator': 0.8,
|
|
108
|
-
}
|
|
109
|
-
|
|
110
|
-
# LightGBM training parameters — tuned for small, personal datasets
|
|
111
|
-
# Aggressive regularization prevents overfitting on < 10K samples
|
|
112
|
-
TRAINING_PARAMS = {
|
|
113
|
-
'objective': 'lambdarank',
|
|
114
|
-
'metric': 'ndcg',
|
|
115
|
-
'ndcg_eval_at': [5, 10],
|
|
116
|
-
'learning_rate': 0.05,
|
|
117
|
-
'num_leaves': 16,
|
|
118
|
-
'max_depth': 4,
|
|
119
|
-
'min_child_samples': 10,
|
|
120
|
-
'subsample': 0.8,
|
|
121
|
-
'reg_alpha': 0.1,
|
|
122
|
-
'reg_lambda': 1.0,
|
|
123
|
-
'boosting_type': 'dart',
|
|
124
|
-
'n_estimators': 50,
|
|
125
|
-
'verbose': -1,
|
|
126
|
-
}
|
|
66
|
+
# Import constants and helpers from ranking subpackage
|
|
67
|
+
from .ranking import (
|
|
68
|
+
MODELS_DIR,
|
|
69
|
+
MODEL_PATH,
|
|
70
|
+
PHASE_THRESHOLDS,
|
|
71
|
+
MIN_UNIQUE_QUERIES_FOR_ML,
|
|
72
|
+
RULE_BOOST,
|
|
73
|
+
TRAINING_PARAMS,
|
|
74
|
+
calculate_rule_boost,
|
|
75
|
+
prepare_training_data_internal,
|
|
76
|
+
)
|
|
127
77
|
|
|
128
78
|
|
|
129
79
|
class AdaptiveRanker:
|
|
@@ -373,102 +323,7 @@ class AdaptiveRanker:
|
|
|
373
323
|
continue
|
|
374
324
|
|
|
375
325
|
features = feature_vectors[i]
|
|
376
|
-
boost = 1.0
|
|
377
|
-
|
|
378
|
-
# Feature [2]: tech_match
|
|
379
|
-
tech_match = features[2]
|
|
380
|
-
if tech_match >= 0.8:
|
|
381
|
-
boost *= _RULE_BOOST['tech_match_strong']
|
|
382
|
-
elif tech_match >= 0.4:
|
|
383
|
-
boost *= _RULE_BOOST['tech_match_weak']
|
|
384
|
-
|
|
385
|
-
# Feature [3]: project_match
|
|
386
|
-
project_match = features[3]
|
|
387
|
-
if project_match >= 0.9:
|
|
388
|
-
boost *= _RULE_BOOST['project_match']
|
|
389
|
-
elif project_match <= 0.35:
|
|
390
|
-
boost *= _RULE_BOOST['project_mismatch']
|
|
391
|
-
|
|
392
|
-
# Feature [5]: source_quality
|
|
393
|
-
source_quality = features[5]
|
|
394
|
-
if source_quality >= 0.7:
|
|
395
|
-
boost *= _RULE_BOOST['source_quality_high']
|
|
396
|
-
elif source_quality < 0.3:
|
|
397
|
-
boost *= _RULE_BOOST['source_quality_low']
|
|
398
|
-
|
|
399
|
-
# Feature [7]: recency_score (exponential decay)
|
|
400
|
-
recency = features[7]
|
|
401
|
-
# Linear interpolation between penalty and boost
|
|
402
|
-
recency_factor = (
|
|
403
|
-
_RULE_BOOST['recency_penalty_max']
|
|
404
|
-
+ recency * (
|
|
405
|
-
_RULE_BOOST['recency_boost_max']
|
|
406
|
-
- _RULE_BOOST['recency_penalty_max']
|
|
407
|
-
)
|
|
408
|
-
)
|
|
409
|
-
boost *= recency_factor
|
|
410
|
-
|
|
411
|
-
# Feature [6]: importance_norm
|
|
412
|
-
importance_norm = features[6]
|
|
413
|
-
if importance_norm >= 0.8:
|
|
414
|
-
boost *= _RULE_BOOST['high_importance']
|
|
415
|
-
|
|
416
|
-
# Feature [8]: access_frequency
|
|
417
|
-
access_freq = features[8]
|
|
418
|
-
if access_freq >= 0.5:
|
|
419
|
-
boost *= _RULE_BOOST['high_access']
|
|
420
|
-
|
|
421
|
-
# Feature [10]: signal_count (v2.7.4 — feedback volume)
|
|
422
|
-
if len(features) > 10:
|
|
423
|
-
signal_count = features[10]
|
|
424
|
-
if signal_count >= 0.3: # 3+ signals
|
|
425
|
-
boost *= 1.1 # Mild boost for well-known memories
|
|
426
|
-
|
|
427
|
-
# Feature [11]: avg_signal_value (v2.7.4 — feedback quality)
|
|
428
|
-
if len(features) > 11:
|
|
429
|
-
avg_signal = features[11]
|
|
430
|
-
if avg_signal >= 0.7:
|
|
431
|
-
boost *= 1.15 # Boost memories with positive feedback
|
|
432
|
-
elif avg_signal < 0.3 and avg_signal > 0.0:
|
|
433
|
-
boost *= 0.85 # Penalize memories with negative feedback
|
|
434
|
-
|
|
435
|
-
# Feature [12]: lifecycle_state (v2.8)
|
|
436
|
-
if len(features) > 12:
|
|
437
|
-
lifecycle_state = features[12]
|
|
438
|
-
if lifecycle_state >= 0.9:
|
|
439
|
-
boost *= _RULE_BOOST.get('lifecycle_active', 1.0)
|
|
440
|
-
elif lifecycle_state >= 0.6:
|
|
441
|
-
boost *= _RULE_BOOST.get('lifecycle_warm', 0.85)
|
|
442
|
-
elif lifecycle_state >= 0.3:
|
|
443
|
-
boost *= _RULE_BOOST.get('lifecycle_cold', 0.6)
|
|
444
|
-
|
|
445
|
-
# Feature [13]: outcome_success_rate (v2.8)
|
|
446
|
-
if len(features) > 13:
|
|
447
|
-
success_rate = features[13]
|
|
448
|
-
if success_rate >= 0.8:
|
|
449
|
-
boost *= _RULE_BOOST.get('outcome_success_high', 1.3)
|
|
450
|
-
elif success_rate <= 0.2:
|
|
451
|
-
boost *= _RULE_BOOST.get('outcome_failure_high', 0.7)
|
|
452
|
-
|
|
453
|
-
# Feature [15]: behavioral_match (v2.8)
|
|
454
|
-
if len(features) > 15:
|
|
455
|
-
behavioral = features[15]
|
|
456
|
-
if behavioral >= 0.7:
|
|
457
|
-
boost *= _RULE_BOOST.get('behavioral_match_strong', 1.25)
|
|
458
|
-
|
|
459
|
-
# Feature [16]: cross_project_score (v2.8)
|
|
460
|
-
if len(features) > 16:
|
|
461
|
-
cross_project = features[16]
|
|
462
|
-
if cross_project >= 0.5:
|
|
463
|
-
boost *= _RULE_BOOST.get('cross_project_boost', 1.15)
|
|
464
|
-
|
|
465
|
-
# Feature [18]: trust_at_creation (v2.8)
|
|
466
|
-
if len(features) > 18:
|
|
467
|
-
trust = features[18]
|
|
468
|
-
if trust >= 0.9:
|
|
469
|
-
boost *= _RULE_BOOST.get('high_trust_creator', 1.1)
|
|
470
|
-
elif trust <= 0.3:
|
|
471
|
-
boost *= _RULE_BOOST.get('low_trust_creator', 0.8)
|
|
326
|
+
boost = calculate_rule_boost(features)
|
|
472
327
|
|
|
473
328
|
# Apply boost to score
|
|
474
329
|
result['score'] = base_score * boost
|
|
@@ -799,12 +654,10 @@ class AdaptiveRanker:
|
|
|
799
654
|
|
|
800
655
|
Returns:
|
|
801
656
|
Tuple of (X, y, groups) for LGBMRanker, or None if insufficient.
|
|
802
|
-
X: numpy array (n_samples,
|
|
657
|
+
X: numpy array (n_samples, NUM_FEATURES)
|
|
803
658
|
y: numpy array (n_samples,) — relevance labels
|
|
804
659
|
groups: list of ints — samples per query group
|
|
805
660
|
"""
|
|
806
|
-
import sqlite3
|
|
807
|
-
|
|
808
661
|
ldb = self._get_learning_db()
|
|
809
662
|
if ldb is None:
|
|
810
663
|
return None
|
|
@@ -813,111 +666,7 @@ class AdaptiveRanker:
|
|
|
813
666
|
if not feedback:
|
|
814
667
|
return None
|
|
815
668
|
|
|
816
|
-
|
|
817
|
-
query_groups: Dict[str, List[dict]] = {}
|
|
818
|
-
for entry in feedback:
|
|
819
|
-
qh = entry['query_hash']
|
|
820
|
-
if qh not in query_groups:
|
|
821
|
-
query_groups[qh] = []
|
|
822
|
-
query_groups[qh].append(entry)
|
|
823
|
-
|
|
824
|
-
# Filter: only keep groups with 2+ items (ranking requires pairs)
|
|
825
|
-
query_groups = {
|
|
826
|
-
qh: entries for qh, entries in query_groups.items()
|
|
827
|
-
if len(entries) >= 2
|
|
828
|
-
}
|
|
829
|
-
|
|
830
|
-
if not query_groups:
|
|
831
|
-
logger.info("No query groups with 2+ feedback entries")
|
|
832
|
-
return None
|
|
833
|
-
|
|
834
|
-
# Collect memory IDs we need to look up
|
|
835
|
-
memory_ids_needed = set()
|
|
836
|
-
for entries in query_groups.values():
|
|
837
|
-
for entry in entries:
|
|
838
|
-
memory_ids_needed.add(entry['memory_id'])
|
|
839
|
-
|
|
840
|
-
# Fetch memories from memory.db
|
|
841
|
-
memory_db_path = Path.home() / ".claude-memory" / "memory.db"
|
|
842
|
-
if not memory_db_path.exists():
|
|
843
|
-
logger.warning("memory.db not found at %s", memory_db_path)
|
|
844
|
-
return None
|
|
845
|
-
|
|
846
|
-
memories_by_id = {}
|
|
847
|
-
try:
|
|
848
|
-
conn = sqlite3.connect(str(memory_db_path), timeout=5)
|
|
849
|
-
conn.row_factory = sqlite3.Row
|
|
850
|
-
cursor = conn.cursor()
|
|
851
|
-
|
|
852
|
-
# Batch fetch memories (in chunks to avoid SQLite variable limit)
|
|
853
|
-
id_list = list(memory_ids_needed)
|
|
854
|
-
chunk_size = 500
|
|
855
|
-
for i in range(0, len(id_list), chunk_size):
|
|
856
|
-
chunk = id_list[i:i + chunk_size]
|
|
857
|
-
placeholders = ','.join('?' for _ in chunk)
|
|
858
|
-
cursor.execute(f'''
|
|
859
|
-
SELECT id, content, summary, project_path, project_name,
|
|
860
|
-
tags, category, memory_type, importance, created_at,
|
|
861
|
-
last_accessed, access_count
|
|
862
|
-
FROM memories
|
|
863
|
-
WHERE id IN ({placeholders})
|
|
864
|
-
''', chunk)
|
|
865
|
-
for row in cursor.fetchall():
|
|
866
|
-
memories_by_id[row['id']] = dict(row)
|
|
867
|
-
conn.close()
|
|
868
|
-
except Exception as e:
|
|
869
|
-
logger.error("Failed to fetch memories for training: %s", e)
|
|
870
|
-
return None
|
|
871
|
-
|
|
872
|
-
# Build feature matrix and labels
|
|
873
|
-
all_features = []
|
|
874
|
-
all_labels = []
|
|
875
|
-
groups = []
|
|
876
|
-
|
|
877
|
-
# Set a neutral context for training (we don't have query-time context)
|
|
878
|
-
self._feature_extractor.set_context()
|
|
879
|
-
|
|
880
|
-
for qh, entries in query_groups.items():
|
|
881
|
-
group_features = []
|
|
882
|
-
group_labels = []
|
|
883
|
-
|
|
884
|
-
for entry in entries:
|
|
885
|
-
mid = entry['memory_id']
|
|
886
|
-
memory = memories_by_id.get(mid)
|
|
887
|
-
if memory is None:
|
|
888
|
-
continue # Memory may have been deleted
|
|
889
|
-
|
|
890
|
-
# Use query_keywords as proxy for query text
|
|
891
|
-
query_text = entry.get('query_keywords', '') or ''
|
|
892
|
-
|
|
893
|
-
features = self._feature_extractor.extract_features(
|
|
894
|
-
memory, query_text
|
|
895
|
-
)
|
|
896
|
-
group_features.append(features)
|
|
897
|
-
group_labels.append(float(entry['signal_value']))
|
|
898
|
-
|
|
899
|
-
# Only include groups with 2+ valid entries
|
|
900
|
-
if len(group_features) >= 2:
|
|
901
|
-
all_features.extend(group_features)
|
|
902
|
-
all_labels.extend(group_labels)
|
|
903
|
-
groups.append(len(group_features))
|
|
904
|
-
|
|
905
|
-
if not groups or len(all_features) < 4:
|
|
906
|
-
logger.info(
|
|
907
|
-
"Insufficient valid training data: %d features, %d groups",
|
|
908
|
-
len(all_features), len(groups)
|
|
909
|
-
)
|
|
910
|
-
return None
|
|
911
|
-
|
|
912
|
-
X = np.array(all_features, dtype=np.float64)
|
|
913
|
-
y = np.array(all_labels, dtype=np.float64)
|
|
914
|
-
|
|
915
|
-
logger.info(
|
|
916
|
-
"Prepared training data: %d samples, %d groups, %d features",
|
|
917
|
-
X.shape[0], len(groups), X.shape[1]
|
|
918
|
-
)
|
|
919
|
-
|
|
920
|
-
return X, y, groups
|
|
669
|
+
return prepare_training_data_internal(feedback, self._feature_extractor)
|
|
921
670
|
|
|
922
671
|
|
|
923
672
|
# ============================================================================
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
# SPDX-License-Identifier: MIT
|
|
3
|
+
# Copyright (c) 2026 SuperLocalMemory (superlocalmemory.com)
|
|
4
|
+
"""
|
|
5
|
+
Bootstrap utilities package.
|
|
6
|
+
|
|
7
|
+
Re-exports all constants, functions, and utilities used by SyntheticBootstrapper.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
# Constants
|
|
11
|
+
from .constants import (
|
|
12
|
+
MEMORY_DB_PATH,
|
|
13
|
+
MODELS_DIR,
|
|
14
|
+
MODEL_PATH,
|
|
15
|
+
MIN_MEMORIES_FOR_BOOTSTRAP,
|
|
16
|
+
BOOTSTRAP_CONFIG,
|
|
17
|
+
BOOTSTRAP_PARAMS,
|
|
18
|
+
STOPWORDS,
|
|
19
|
+
MIN_KEYWORD_LENGTH,
|
|
20
|
+
)
|
|
21
|
+
|
|
22
|
+
# Text utilities
|
|
23
|
+
from .text_utils import (
|
|
24
|
+
extract_keywords,
|
|
25
|
+
clean_fts_query,
|
|
26
|
+
)
|
|
27
|
+
|
|
28
|
+
# Database queries
|
|
29
|
+
from .db_queries import (
|
|
30
|
+
get_memory_count,
|
|
31
|
+
get_memories_by_access,
|
|
32
|
+
get_memories_by_importance,
|
|
33
|
+
get_recent_memories,
|
|
34
|
+
get_learned_patterns,
|
|
35
|
+
search_memories,
|
|
36
|
+
find_negative_memories,
|
|
37
|
+
)
|
|
38
|
+
|
|
39
|
+
# Sampling utilities
|
|
40
|
+
from .sampling import (
|
|
41
|
+
diverse_sample,
|
|
42
|
+
count_sources,
|
|
43
|
+
)
|
|
44
|
+
|
|
45
|
+
__all__ = [
|
|
46
|
+
# Constants
|
|
47
|
+
'MEMORY_DB_PATH',
|
|
48
|
+
'MODELS_DIR',
|
|
49
|
+
'MODEL_PATH',
|
|
50
|
+
'MIN_MEMORIES_FOR_BOOTSTRAP',
|
|
51
|
+
'BOOTSTRAP_CONFIG',
|
|
52
|
+
'BOOTSTRAP_PARAMS',
|
|
53
|
+
'STOPWORDS',
|
|
54
|
+
'MIN_KEYWORD_LENGTH',
|
|
55
|
+
# Text utilities
|
|
56
|
+
'extract_keywords',
|
|
57
|
+
'clean_fts_query',
|
|
58
|
+
# Database queries
|
|
59
|
+
'get_memory_count',
|
|
60
|
+
'get_memories_by_access',
|
|
61
|
+
'get_memories_by_importance',
|
|
62
|
+
'get_recent_memories',
|
|
63
|
+
'get_learned_patterns',
|
|
64
|
+
'search_memories',
|
|
65
|
+
'find_negative_memories',
|
|
66
|
+
# Sampling
|
|
67
|
+
'diverse_sample',
|
|
68
|
+
'count_sources',
|
|
69
|
+
]
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
# SPDX-License-Identifier: MIT
|
|
3
|
+
# Copyright (c) 2026 SuperLocalMemory (superlocalmemory.com)
|
|
4
|
+
"""
|
|
5
|
+
Bootstrap constants and configuration.
|
|
6
|
+
|
|
7
|
+
All constant values, configuration dicts, and static data used
|
|
8
|
+
by SyntheticBootstrapper are defined here.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
|
|
13
|
+
# ============================================================================
|
|
14
|
+
# Paths
|
|
15
|
+
# ============================================================================
|
|
16
|
+
|
|
17
|
+
MEMORY_DB_PATH = Path.home() / ".claude-memory" / "memory.db"
|
|
18
|
+
MODELS_DIR = Path.home() / ".claude-memory" / "models"
|
|
19
|
+
MODEL_PATH = MODELS_DIR / "ranker.txt"
|
|
20
|
+
|
|
21
|
+
# ============================================================================
|
|
22
|
+
# Bootstrap Configuration
|
|
23
|
+
# ============================================================================
|
|
24
|
+
|
|
25
|
+
# Minimum memories needed before bootstrap makes sense
|
|
26
|
+
MIN_MEMORIES_FOR_BOOTSTRAP = 50
|
|
27
|
+
|
|
28
|
+
# Tiered config — bootstrap model complexity scales with data size
|
|
29
|
+
BOOTSTRAP_CONFIG = {
|
|
30
|
+
'small': {
|
|
31
|
+
'min_memories': 50,
|
|
32
|
+
'max_memories': 499,
|
|
33
|
+
'target_samples': 200,
|
|
34
|
+
'n_estimators': 30,
|
|
35
|
+
'max_depth': 3,
|
|
36
|
+
},
|
|
37
|
+
'medium': {
|
|
38
|
+
'min_memories': 500,
|
|
39
|
+
'max_memories': 4999,
|
|
40
|
+
'target_samples': 1000,
|
|
41
|
+
'n_estimators': 50,
|
|
42
|
+
'max_depth': 4,
|
|
43
|
+
},
|
|
44
|
+
'large': {
|
|
45
|
+
'min_memories': 5000,
|
|
46
|
+
'max_memories': float('inf'),
|
|
47
|
+
'target_samples': 2000,
|
|
48
|
+
'n_estimators': 100,
|
|
49
|
+
'max_depth': 6,
|
|
50
|
+
},
|
|
51
|
+
}
|
|
52
|
+
|
|
53
|
+
# ============================================================================
|
|
54
|
+
# LightGBM Parameters
|
|
55
|
+
# ============================================================================
|
|
56
|
+
|
|
57
|
+
# LightGBM bootstrap parameters — MORE aggressive regularization than
|
|
58
|
+
# real training because synthetic data has systematic biases
|
|
59
|
+
BOOTSTRAP_PARAMS = {
|
|
60
|
+
'objective': 'lambdarank',
|
|
61
|
+
'metric': 'ndcg',
|
|
62
|
+
'ndcg_eval_at': [5, 10],
|
|
63
|
+
'learning_rate': 0.1,
|
|
64
|
+
'num_leaves': 8,
|
|
65
|
+
'max_depth': 3,
|
|
66
|
+
'min_child_samples': 5,
|
|
67
|
+
'subsample': 0.7,
|
|
68
|
+
'reg_alpha': 0.5,
|
|
69
|
+
'reg_lambda': 2.0,
|
|
70
|
+
'boosting_type': 'dart',
|
|
71
|
+
'verbose': -1,
|
|
72
|
+
}
|
|
73
|
+
|
|
74
|
+
# ============================================================================
|
|
75
|
+
# Text Processing
|
|
76
|
+
# ============================================================================
|
|
77
|
+
|
|
78
|
+
# English stopwords for keyword extraction (no external deps)
|
|
79
|
+
STOPWORDS = frozenset({
|
|
80
|
+
'a', 'an', 'the', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
|
|
81
|
+
'of', 'with', 'by', 'from', 'is', 'it', 'this', 'that', 'was', 'are',
|
|
82
|
+
'be', 'has', 'have', 'had', 'do', 'does', 'did', 'will', 'would',
|
|
83
|
+
'could', 'should', 'may', 'might', 'can', 'not', 'no', 'if', 'then',
|
|
84
|
+
'so', 'as', 'up', 'out', 'about', 'into', 'over', 'after', 'before',
|
|
85
|
+
'when', 'where', 'how', 'what', 'which', 'who', 'whom', 'why',
|
|
86
|
+
'all', 'each', 'every', 'both', 'few', 'more', 'most', 'other',
|
|
87
|
+
'some', 'such', 'than', 'too', 'very', 'just', 'also', 'now',
|
|
88
|
+
'here', 'there', 'use', 'used', 'using', 'make', 'made',
|
|
89
|
+
'need', 'needed', 'get', 'got', 'set', 'new', 'old', 'one', 'two',
|
|
90
|
+
})
|
|
91
|
+
|
|
92
|
+
# Minimum word length for keyword extraction
|
|
93
|
+
MIN_KEYWORD_LENGTH = 3
|