analyxa 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
analyxa/__init__.py ADDED
@@ -0,0 +1,8 @@
1
+ """Analyxa — Multi-dimensional extraction engine for AI conversations."""
2
+
3
+ __version__ = "0.1.0"
4
+
5
+ from analyxa.analyzer import Analyzer, AnalysisResult, analyze
6
+ from analyxa.schema import SchemaManager
7
+
8
+ __all__ = ["Analyzer", "AnalysisResult", "analyze", "SchemaManager", "__version__"]
analyxa/analyzer.py ADDED
@@ -0,0 +1,183 @@
1
+ """Analyzer — main pipeline orchestrator for Analyxa."""
2
+
3
+ import hashlib
4
+ from dataclasses import dataclass
5
+ from datetime import datetime, timezone
6
+
7
+ from analyxa.schema import SchemaManager
8
+ from analyxa.prompt_builder import build_prompt
9
+ from analyxa.llm_client import LLMClient, LLMResponse
10
+ from analyxa.embeddings import EmbeddingGenerator
11
+
12
+
13
@dataclass
class AnalysisResult:
    """Everything produced by analyzing one conversation.

    Bundles the LLM-extracted fields with schema information, metadata
    computed by the pipeline, the optional embedding vector, and details
    about how the run went.
    """

    # Values the LLM extracted from the conversation
    fields: dict

    # Schema that drove the extraction
    schema_name: str
    schema_version: str

    # Metadata computed by the pipeline itself
    analyzed_at: str  # ISO 8601 UTC timestamp
    analysis_model: str  # LLM model used
    embedding_model: str | None
    session_length: int  # Approximate line count
    conversation_hash: str  # SHA-256 truncated to 16 hex chars

    # Semantic vector, if one was generated
    embedding: list[float] | None

    # Execution details
    llm_response: LLMResponse
    validation_errors: list[str]

    def to_dict(self) -> dict:
        """Return the extracted fields plus a nested ``_meta`` entry (for Qdrant/storage).

        The embedding vector itself is not included; ``_meta["has_embedding"]``
        only records whether one exists.
        """
        payload = dict(self.fields)
        response = self.llm_response
        payload["_meta"] = dict(
            schema_name=self.schema_name,
            schema_version=self.schema_version,
            analyzed_at=self.analyzed_at,
            analysis_model=self.analysis_model,
            embedding_model=self.embedding_model,
            session_length=self.session_length,
            conversation_hash=self.conversation_hash,
            has_embedding=self.embedding is not None,
            input_tokens=response.input_tokens,
            output_tokens=response.output_tokens,
            latency_ms=response.latency_ms,
            validation_errors=self.validation_errors,
        )
        return payload

    def to_flat_dict(self) -> dict:
        """Return fields and core metadata at one level, without the embedding (for JSON/CSV export).

        Metadata keys are written after the extracted fields, so they win
        if an extracted field happens to use the same name.
        """
        flat = dict(self.fields)
        exported = (
            "schema_name",
            "schema_version",
            "analyzed_at",
            "analysis_model",
            "session_length",
            "conversation_hash",
        )
        for attr in exported:
            flat[attr] = getattr(self, attr)
        return flat
69
+
70
+
71
class Analyzer:
    """Run the end-to-end analysis pipeline for one schema.

    Stages, in order:
        conversation → build_prompt → LLMClient → validate → embed → AnalysisResult
    """

    def __init__(
        self,
        schema_name: str = "universal",
        provider: str = "anthropic",
        model: str | None = None,
        api_key: str | None = None,
        enable_embeddings: bool = True,
        embedding_api_key: str | None = None,
    ) -> None:
        """Load the schema and set up the LLM client and optional embedder.

        Args:
            schema_name: Schema to load through ``SchemaManager``.
            provider: Passed to ``LLMClient``.
            model: Passed to ``LLMClient``.
            api_key: Passed to ``LLMClient``.
            enable_embeddings: When False, no ``EmbeddingGenerator`` is created.
            embedding_api_key: Passed to ``EmbeddingGenerator``.
        """
        self.schema_name = schema_name
        self.schema_manager = SchemaManager()
        self.schema = self.schema_manager.load_schema(schema_name)
        self.llm_client = LLMClient(provider=provider, model=model, api_key=api_key)
        self.embedding_generator: EmbeddingGenerator | None = (
            EmbeddingGenerator(api_key=embedding_api_key)
            if enable_embeddings
            else None
        )

    def analyze(
        self, conversation: str, context: dict | None = None
    ) -> AnalysisResult:
        """Analyze one conversation string and return the populated result.

        An LLM failure (unsuccessful call or no parsed JSON) yields a result
        with empty ``fields`` and a single ``validation_errors`` entry
        instead of running validation and embedding.
        """
        # Prompt construction and the LLM call come first
        llm_response = self.llm_client.analyze(
            build_prompt(self.schema, conversation, context)
        )

        # Auto-computed metadata, needed on both the failure and success paths
        timestamp = datetime.now(timezone.utc).isoformat()
        line_count = conversation.count("\n") + 1
        digest = hashlib.sha256(conversation.encode()).hexdigest()[:16]

        def assemble(fields, vector, vector_model, problems):
            # Shared constructor call so both exit paths fill the same metadata
            return AnalysisResult(
                fields=fields,
                schema_name=self.schema_name,
                schema_version=self.schema["metadata"]["version"],
                analyzed_at=timestamp,
                analysis_model=self.llm_client.model,
                embedding_model=vector_model,
                session_length=line_count,
                conversation_hash=digest,
                embedding=vector,
                llm_response=llm_response,
                validation_errors=problems,
            )

        # Failure path: nothing usable came back from the LLM
        llm_ok = llm_response.success and llm_response.parsed_json is not None
        if not llm_ok:
            return assemble(
                {},
                None,
                None,
                [f"LLM failed: {llm_response.error or 'Could not parse JSON'}"],
            )

        extracted = llm_response.parsed_json

        # Schema validation; only the error list is carried into the result
        _valid, problems = self.schema_manager.validate_result(
            self.schema_name, extracted
        )

        # The embedding is derived from the "summary" field when one is present
        vector = None
        vector_model = None
        if self.embedding_generator:
            summary = extracted.get("summary", "")
            if summary:
                vector = self.embedding_generator.generate(summary)
            if vector is not None:
                vector_model = "text-embedding-3-small"

        return assemble(extracted, vector, vector_model, problems)
161
+
162
+
163
def analyze(
    conversation: str,
    schema: str = "universal",
    provider: str = "anthropic",
    model: str | None = None,
    context: dict | None = None,
    enable_embeddings: bool = True,
) -> AnalysisResult:
    """Build a throwaway Analyzer and analyze a single conversation.

    A new ``Analyzer`` is constructed on every call.

    Usage:
        from analyxa import analyze
        result = analyze("User: hello\\nAgent: hi", schema="support")
    """
    settings = {
        "schema_name": schema,
        "provider": provider,
        "model": model,
        "enable_embeddings": enable_embeddings,
    }
    return Analyzer(**settings).analyze(conversation, context=context)
analyxa/batch.py ADDED
@@ -0,0 +1,185 @@
1
+ """Batch processor — analyzes multiple conversations in sequence."""
2
+
3
+ import time
4
+ from dataclasses import dataclass, field
5
+ from typing import Any, Callable
6
+
7
+
8
@dataclass
class BatchResult:
    """Aggregate outcome of one batch analysis run."""

    total: int
    successful: int
    failed: int
    results: list  # list[AnalysisResult]
    errors: list[dict]  # [{"id": str, "error": str}]
    elapsed_seconds: float

    @property
    def success_rate(self) -> float:
        """Fraction of the batch that succeeded; 0.0 when ``total`` is not positive."""
        if self.total > 0:
            return self.successful / self.total
        return 0.0
22
+
23
+
24
def batch_analyze(
    conversations: list[dict],
    schema_name: str = "universal",
    provider: str = "anthropic",
    model: str | None = None,
    enable_embeddings: bool = True,
    sink: Any | None = None,
    on_progress: Callable | None = None,
) -> BatchResult:
    """Analyze a list of conversations in sequence.

    A failure while analyzing one conversation is recorded in the returned
    ``errors`` list and does not stop the batch. A failure while writing to
    the sink is logged as a warning and likewise does not stop the batch;
    the conversation still counts as successful.

    Args:
        conversations: List of dicts with at least {"text": str}.
            Optional: {"text": str, "id": str, "context": dict}
        schema_name: Schema to use for all analyses.
        provider: LLM provider.
        model: LLM model override.
        enable_embeddings: Whether to generate embeddings.
        sink: If provided, write each result immediately after analysis.
            An object with ``store`` receives the result itself; otherwise
            an object with ``write`` receives ``result.to_flat_dict()``.
        on_progress: Callback (current: int, total: int, result | None) -> None.
            ``result`` is None when that conversation failed.

    Returns:
        BatchResult whose ``results`` holds only the successful analyses.
    """
    # Local imports: logging is not imported at module level in this file,
    # and Analyzer was already imported lazily here.
    import logging

    from analyxa.analyzer import Analyzer

    logger = logging.getLogger(__name__)

    start = time.monotonic()
    total = len(conversations)
    successful = 0
    failed = 0
    results = []
    errors = []

    # Create a single Analyzer — reused across all analyses
    analyzer = Analyzer(
        schema_name=schema_name,
        provider=provider,
        model=model,
        enable_embeddings=enable_embeddings,
    )

    for i, conv in enumerate(conversations, 1):
        text = conv.get("text", "")
        conv_id = conv.get("id", str(i))
        context = conv.get("context")

        result = None
        try:
            result = analyzer.analyze(text, context=context)
        except Exception as exc:
            failed += 1
            errors.append({"id": conv_id, "error": f"{type(exc).__name__}: {exc}"})
        else:
            results.append(result)
            successful += 1

            if sink is not None:
                try:
                    if hasattr(sink, "store"):
                        sink.store(result)
                    elif hasattr(sink, "write"):
                        sink.write(result.to_flat_dict())
                except Exception:
                    # Sink errors don't abort the batch, but they must not
                    # vanish silently: the result was not persisted.
                    logger.warning(
                        "Sink failed for conversation %s; continuing batch",
                        conv_id,
                        exc_info=True,
                    )

        if on_progress is not None:
            on_progress(i, total, result)

    elapsed = time.monotonic() - start
    return BatchResult(
        total=total,
        successful=successful,
        failed=failed,
        results=results,
        errors=errors,
        elapsed_seconds=elapsed,
    )
99
+
100
+
101
def batch_analyze_from_redis(
    redis_source=None,
    qdrant_sink=None,
    max_items: int | None = None,
    provider: str = "anthropic",
    model: str | None = None,
    enable_embeddings: bool = True,
    on_progress: Callable | None = None,
) -> BatchResult:
    """Process pending conversations from Redis queue and store in Qdrant.

    Each pending id is fetched, analyzed with the schema named in its own
    record (default "universal"), stored in Qdrant, and then marked analyzed
    in Redis. Any exception along the way marks the conversation failed in
    Redis, records it in ``errors``, and moves on to the next id.

    Args:
        redis_source: RedisSource instance. Created with defaults if None.
        qdrant_sink: QdrantSink instance. Created with defaults if None.
        max_items: Max conversations to process. None = all pending.
        provider: LLM provider.
        model: LLM model override.
        enable_embeddings: Whether to generate embeddings.
        on_progress: Callback (current: int, total: int, result | None) -> None.
            ``result`` is None when that conversation was missing or failed.

    Returns:
        BatchResult whose ``results`` holds only the successful analyses.
    """
    from analyxa.sources.redis_source import RedisSource
    from analyxa.sinks.qdrant_sink import QdrantSink
    from analyxa.analyzer import Analyzer

    if redis_source is None:
        redis_source = RedisSource()
    if qdrant_sink is None:
        qdrant_sink = QdrantSink()

    # Gather pending conversations
    pending_ids = redis_source.pending()
    if max_items is not None:
        pending_ids = pending_ids[:max_items]

    total = len(pending_ids)
    start = time.monotonic()
    successful = 0
    failed = 0
    results = []
    errors = []

    # One Analyzer per schema name, built on first use, so conversations
    # sharing a schema do not each rebuild the clients and reload the schema.
    analyzers = {}

    for i, conv_id in enumerate(pending_ids, 1):
        # Result reported to on_progress for THIS conversation only
        result = None

        conv_data = redis_source.get(conv_id)
        if conv_data is None:
            failed += 1
            errors.append({"id": conv_id, "error": "Conversation not found in Redis"})
            if on_progress is not None:
                on_progress(i, total, None)
            continue

        text = conv_data.get("text", "")
        schema = conv_data.get("schema", "universal")
        context = conv_data.get("context")

        try:
            # Construction stays inside the try so a bad schema name fails
            # only this conversation, not the whole batch.
            analyzer = analyzers.get(schema)
            if analyzer is None:
                analyzer = Analyzer(
                    schema_name=schema,
                    provider=provider,
                    model=model,
                    enable_embeddings=enable_embeddings,
                )
                analyzers[schema] = analyzer

            result = analyzer.analyze(text, context=context)
            # Persist before acknowledging: if the store fails, the
            # conversation must not already be marked analyzed.
            qdrant_sink.store(result)
            redis_source.mark_analyzed(conv_id)
            results.append(result)
            successful += 1

        except Exception as exc:
            # Analysis may have succeeded before a later step failed; the
            # conversation still counts as failed, so report no result.
            result = None
            error_msg = f"{type(exc).__name__}: {exc}"
            redis_source.mark_failed(conv_id, error_msg)
            failed += 1
            errors.append({"id": conv_id, "error": error_msg})

        if on_progress is not None:
            on_progress(i, total, result)

    elapsed = time.monotonic() - start
    return BatchResult(
        total=total,
        successful=successful,
        failed=failed,
        results=results,
        errors=errors,
        elapsed_seconds=elapsed,
    )