analyxa 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- analyxa/__init__.py +8 -0
- analyxa/analyzer.py +183 -0
- analyxa/batch.py +185 -0
- analyxa/cli.py +367 -0
- analyxa/config.py +149 -0
- analyxa/embeddings.py +71 -0
- analyxa/llm_client.py +229 -0
- analyxa/prompt_builder.py +165 -0
- analyxa/schema.py +155 -0
- analyxa/schemas/coaching.yaml +165 -0
- analyxa/schemas/sales.yaml +100 -0
- analyxa/schemas/support.yaml +81 -0
- analyxa/schemas/universal.yaml +153 -0
- analyxa/sinks/__init__.py +0 -0
- analyxa/sinks/json_sink.py +20 -0
- analyxa/sinks/qdrant_sink.py +173 -0
- analyxa/sinks/stdout_sink.py +11 -0
- analyxa/sources/__init__.py +0 -0
- analyxa/sources/file_source.py +47 -0
- analyxa/sources/redis_source.py +158 -0
- analyxa-0.1.0.dist-info/METADATA +242 -0
- analyxa-0.1.0.dist-info/RECORD +26 -0
- analyxa-0.1.0.dist-info/WHEEL +5 -0
- analyxa-0.1.0.dist-info/entry_points.txt +2 -0
- analyxa-0.1.0.dist-info/licenses/LICENSE +191 -0
- analyxa-0.1.0.dist-info/top_level.txt +1 -0
analyxa/__init__.py
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
"""Analyxa — Multi-dimensional extraction engine for AI conversations."""
|
|
2
|
+
|
|
3
|
+
__version__ = "0.1.0"
|
|
4
|
+
|
|
5
|
+
from analyxa.analyzer import Analyzer, AnalysisResult, analyze
|
|
6
|
+
from analyxa.schema import SchemaManager
|
|
7
|
+
|
|
8
|
+
__all__ = ["Analyzer", "AnalysisResult", "analyze", "SchemaManager", "__version__"]
|
analyxa/analyzer.py
ADDED
|
@@ -0,0 +1,183 @@
|
|
|
1
|
+
"""Analyzer — main pipeline orchestrator for Analyxa."""
|
|
2
|
+
|
|
3
|
+
import hashlib
|
|
4
|
+
from dataclasses import dataclass
|
|
5
|
+
from datetime import datetime, timezone
|
|
6
|
+
|
|
7
|
+
from analyxa.schema import SchemaManager
|
|
8
|
+
from analyxa.prompt_builder import build_prompt
|
|
9
|
+
from analyxa.llm_client import LLMClient, LLMResponse
|
|
10
|
+
from analyxa.embeddings import EmbeddingGenerator
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
@dataclass
class AnalysisResult:
    """Outcome of analyzing a single conversation.

    Holds the fields extracted by the LLM together with schema metadata,
    metadata computed by the analyzer, the optional embedding vector, and
    the LLM response with any validation errors.
    """

    # Values extracted by the LLM
    fields: dict

    # Schema that was used for the extraction
    schema_name: str
    schema_version: str

    # Metadata computed automatically for each run
    analyzed_at: str  # ISO 8601 UTC timestamp
    analysis_model: str  # LLM model used
    embedding_model: str | None
    session_length: int  # Approximate line count
    conversation_hash: str  # SHA-256 truncated to 16 hex chars

    # Semantic vector (None when no embedding is available)
    embedding: list[float] | None

    # Execution details
    llm_response: LLMResponse
    validation_errors: list[str]

    def to_dict(self) -> dict:
        """Return the fields plus a nested ``_meta`` mapping (for Qdrant/storage).

        A ``_meta`` key already present in ``fields`` is replaced by the
        metadata mapping.
        """
        payload = dict(self.fields)
        response = self.llm_response
        payload["_meta"] = {
            "schema_name": self.schema_name,
            "schema_version": self.schema_version,
            "analyzed_at": self.analyzed_at,
            "analysis_model": self.analysis_model,
            "embedding_model": self.embedding_model,
            "session_length": self.session_length,
            "conversation_hash": self.conversation_hash,
            "has_embedding": self.embedding is not None,
            "input_tokens": response.input_tokens,
            "output_tokens": response.output_tokens,
            "latency_ms": response.latency_ms,
            "validation_errors": self.validation_errors,
        }
        return payload

    def to_flat_dict(self) -> dict:
        """Return fields and metadata side by side, without the embedding (for JSON/CSV export).

        Metadata keys overwrite extracted fields that share the same name.
        """
        flat = dict(self.fields)
        meta_attributes = (
            "schema_name",
            "schema_version",
            "analyzed_at",
            "analysis_model",
            "session_length",
            "conversation_hash",
        )
        for attribute in meta_attributes:
            flat[attribute] = getattr(self, attribute)
        return flat
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
class Analyzer:
    """Runs the end-to-end analysis pipeline for one schema.

    Pipeline:
        conversation → build_prompt → LLMClient → validate → embed → AnalysisResult
    """

    def __init__(
        self,
        schema_name: str = "universal",
        provider: str = "anthropic",
        model: str | None = None,
        api_key: str | None = None,
        enable_embeddings: bool = True,
        embedding_api_key: str | None = None,
    ) -> None:
        """Load the schema and create the LLM client and optional embedder.

        Args:
            schema_name: Name of the schema passed to ``SchemaManager.load_schema``.
            provider: Provider name forwarded to ``LLMClient``.
            model: Model override forwarded to ``LLMClient``.
            api_key: API key forwarded to ``LLMClient``.
            enable_embeddings: When false, no ``EmbeddingGenerator`` is created.
            embedding_api_key: API key forwarded to ``EmbeddingGenerator``.
        """
        self.schema_name = schema_name
        self.schema_manager = SchemaManager()
        self.schema = self.schema_manager.load_schema(schema_name)
        self.llm_client = LLMClient(provider=provider, model=model, api_key=api_key)
        self.embedding_generator: EmbeddingGenerator | None = (
            EmbeddingGenerator(api_key=embedding_api_key)
            if enable_embeddings
            else None
        )

    def analyze(
        self, conversation: str, context: dict | None = None
    ) -> AnalysisResult:
        """Analyze one conversation and return its ``AnalysisResult``.

        If the LLM call is unsuccessful or yields no parsed JSON, the result
        has empty fields, no embedding, and a single "LLM failed: ..." entry
        in ``validation_errors``.
        """
        # Prompt construction and LLM call
        prompt_text = build_prompt(self.schema, conversation, context)
        response = self.llm_client.analyze(prompt_text)

        # Auto-computed metadata, needed on both the failure and success paths
        timestamp = datetime.now(timezone.utc).isoformat()
        line_count = len(conversation.split("\n"))
        digest = hashlib.sha256(conversation.encode()).hexdigest()
        short_hash = digest[:16]

        def make_result(**specific):
            # Fills in the metadata shared by both outcomes; evaluated lazily
            # at the point where the result is actually built.
            return AnalysisResult(
                schema_name=self.schema_name,
                schema_version=self.schema["metadata"]["version"],
                analyzed_at=timestamp,
                analysis_model=self.llm_client.model,
                session_length=line_count,
                conversation_hash=short_hash,
                llm_response=response,
                **specific,
            )

        # LLM failure: return early with an explanatory validation error
        llm_ok = response.success and response.parsed_json is not None
        if not llm_ok:
            reason = response.error or "Could not parse JSON"
            return make_result(
                fields={},
                embedding_model=None,
                embedding=None,
                validation_errors=[f"LLM failed: {reason}"],
            )

        extracted = response.parsed_json

        # Schema validation; only the error list is kept
        _, problems = self.schema_manager.validate_result(
            self.schema_name, extracted
        )

        # Embedding is generated from the "summary" field when one is present
        vector = None
        if self.embedding_generator:
            summary_text = extracted.get("summary", "")
            if summary_text:
                vector = self.embedding_generator.generate(summary_text)
        vector_model = "text-embedding-3-small" if vector is not None else None

        return make_result(
            fields=extracted,
            embedding_model=vector_model,
            embedding=vector,
            validation_errors=problems,
        )
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
def analyze(
    conversation: str,
    schema: str = "universal",
    provider: str = "anthropic",
    model: str | None = None,
    context: dict | None = None,
    enable_embeddings: bool = True,
) -> AnalysisResult:
    """One-shot helper: build an ``Analyzer`` and run it on one conversation.

    Usage:
        from analyxa import analyze
        result = analyze("User: hello\\nAgent: hi", schema="support")
    """
    return Analyzer(
        schema_name=schema,
        provider=provider,
        model=model,
        enable_embeddings=enable_embeddings,
    ).analyze(conversation, context=context)
|
analyxa/batch.py
ADDED
|
@@ -0,0 +1,185 @@
|
|
|
1
|
+
"""Batch processor — analyzes multiple conversations in sequence."""
|
|
2
|
+
|
|
3
|
+
import time
|
|
4
|
+
from dataclasses import dataclass, field
|
|
5
|
+
from typing import Any, Callable
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
@dataclass
class BatchResult:
    """Aggregate outcome of one batch analysis run."""

    total: int
    successful: int
    failed: int
    results: list  # list[AnalysisResult]
    errors: list[dict]  # [{"id": str, "error": str}]
    elapsed_seconds: float

    @property
    def success_rate(self) -> float:
        """Share of items that succeeded; 0.0 when ``total`` is not positive."""
        if not self.total > 0:
            return 0.0
        return self.successful / self.total
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def batch_analyze(
    conversations: list[dict],
    schema_name: str = "universal",
    provider: str = "anthropic",
    model: str | None = None,
    enable_embeddings: bool = True,
    sink: Any | None = None,
    on_progress: Callable | None = None,
) -> BatchResult:
    """Analyze a list of conversations in sequence.

    Args:
        conversations: List of dicts with at least {"text": str}.
            Optional: {"text": str, "id": str, "context": dict}
        schema_name: Schema to use for all analyses.
        provider: LLM provider.
        model: LLM model override.
        enable_embeddings: Whether to generate embeddings.
        sink: If provided, write each result immediately after analysis.
            Supports QdrantSink.store() or JsonSink.write() / StdoutSink.write().
            Sink failures are logged as warnings and never abort the batch.
        on_progress: Callback (current: int, total: int, result | None) -> None

    Returns:
        BatchResult with the results of the analyses that did not raise, one
        error entry per analysis that raised, and the elapsed wall time.
    """
    import logging

    from analyxa.analyzer import Analyzer

    logger = logging.getLogger(__name__)

    start = time.monotonic()
    total = len(conversations)
    successful = 0
    failed = 0
    results = []
    errors = []

    # Create a single Analyzer — reuses schema cache across all analyses
    analyzer = Analyzer(
        schema_name=schema_name,
        provider=provider,
        model=model,
        enable_embeddings=enable_embeddings,
    )

    for i, conv in enumerate(conversations, 1):
        text = conv.get("text", "")
        conv_id = conv.get("id", str(i))
        context = conv.get("context")

        # Stays None when the analysis raises, so the callback sees None.
        result = None
        try:
            result = analyzer.analyze(text, context=context)
        except Exception as exc:
            # Only raised exceptions count as failures; a result object that
            # carries validation errors is still counted as successful.
            failed += 1
            errors.append({"id": conv_id, "error": f"{type(exc).__name__}: {exc}"})
        else:
            results.append(result)
            successful += 1

            if sink is not None:
                try:
                    if hasattr(sink, "store"):
                        sink.store(result)
                    elif hasattr(sink, "write"):
                        sink.write(result.to_flat_dict())
                except Exception:
                    # Best effort: sink errors don't abort the batch, but they
                    # are logged so a broken sink does not go unnoticed.
                    logger.warning(
                        "Sink failed for conversation %s; continuing batch",
                        conv_id,
                        exc_info=True,
                    )

        if on_progress is not None:
            on_progress(i, total, result)

    elapsed = time.monotonic() - start
    return BatchResult(
        total=total,
        successful=successful,
        failed=failed,
        results=results,
        errors=errors,
        elapsed_seconds=elapsed,
    )
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def batch_analyze_from_redis(
    redis_source=None,
    qdrant_sink=None,
    max_items: int | None = None,
    provider: str = "anthropic",
    model: str | None = None,
    enable_embeddings: bool = True,
    on_progress: Callable | None = None,
) -> BatchResult:
    """Process pending conversations from Redis queue and store in Qdrant.

    Args:
        redis_source: RedisSource instance. Created with defaults if None.
        qdrant_sink: QdrantSink instance. Created with defaults if None.
        max_items: Max conversations to process. None = all pending.
        provider: LLM provider.
        model: LLM model override.
        enable_embeddings: Whether to generate embeddings.
        on_progress: Callback (current: int, total: int, result | None) -> None.
            The third argument is the result of the conversation just
            processed, or None when that conversation failed.

    Returns:
        BatchResult with the stored results, one error entry per conversation
        that was missing or raised, and the elapsed wall time.
    """
    from analyxa.sources.redis_source import RedisSource
    from analyxa.sinks.qdrant_sink import QdrantSink
    from analyxa.analyzer import Analyzer

    if redis_source is None:
        redis_source = RedisSource()
    if qdrant_sink is None:
        qdrant_sink = QdrantSink()

    # Gather pending conversations
    pending_ids = redis_source.pending()
    if max_items is not None:
        pending_ids = pending_ids[:max_items]

    total = len(pending_ids)
    start = time.monotonic()
    successful = 0
    failed = 0
    results = []
    errors = []

    # One Analyzer per schema name, so repeated schemas reuse the same instance.
    analyzers = {}

    for i, conv_id in enumerate(pending_ids, 1):
        # Result for THIS conversation only; stays None on any failure so the
        # progress callback never receives a previous conversation's result.
        result = None

        conv_data = redis_source.get(conv_id)
        if conv_data is None:
            failed += 1
            errors.append({"id": conv_id, "error": "Conversation not found in Redis"})
        else:
            text = conv_data.get("text", "")
            schema = conv_data.get("schema", "universal")
            context = conv_data.get("context")

            try:
                analyzer = analyzers.get(schema)
                if analyzer is None:
                    analyzer = Analyzer(
                        schema_name=schema,
                        provider=provider,
                        model=model,
                        enable_embeddings=enable_embeddings,
                    )
                    analyzers[schema] = analyzer

                analysis = analyzer.analyze(text, context=context)
                # Persist first, then mark as analyzed: a failed store must
                # not leave the conversation flagged as analyzed.
                qdrant_sink.store(analysis)
                redis_source.mark_analyzed(conv_id)
                results.append(analysis)
                successful += 1
                result = analysis

            except Exception as exc:
                error_msg = f"{type(exc).__name__}: {exc}"
                redis_source.mark_failed(conv_id, error_msg)
                failed += 1
                errors.append({"id": conv_id, "error": error_msg})

        if on_progress is not None:
            on_progress(i, total, result)

    elapsed = time.monotonic() - start
    return BatchResult(
        total=total,
        successful=successful,
        failed=failed,
        results=results,
        errors=errors,
        elapsed_seconds=elapsed,
    )
|