gnosisllm-knowledge 0.2.0__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63)
  1. gnosisllm_knowledge/__init__.py +91 -39
  2. gnosisllm_knowledge/api/__init__.py +3 -2
  3. gnosisllm_knowledge/api/knowledge.py +502 -32
  4. gnosisllm_knowledge/api/memory.py +966 -0
  5. gnosisllm_knowledge/backends/__init__.py +14 -5
  6. gnosisllm_knowledge/backends/memory/indexer.py +27 -2
  7. gnosisllm_knowledge/backends/memory/searcher.py +111 -10
  8. gnosisllm_knowledge/backends/opensearch/agentic.py +355 -48
  9. gnosisllm_knowledge/backends/opensearch/config.py +49 -28
  10. gnosisllm_knowledge/backends/opensearch/indexer.py +49 -3
  11. gnosisllm_knowledge/backends/opensearch/mappings.py +14 -5
  12. gnosisllm_knowledge/backends/opensearch/memory/__init__.py +12 -0
  13. gnosisllm_knowledge/backends/opensearch/memory/client.py +1380 -0
  14. gnosisllm_knowledge/backends/opensearch/memory/config.py +127 -0
  15. gnosisllm_knowledge/backends/opensearch/memory/setup.py +322 -0
  16. gnosisllm_knowledge/backends/opensearch/queries.py +33 -33
  17. gnosisllm_knowledge/backends/opensearch/searcher.py +238 -0
  18. gnosisllm_knowledge/backends/opensearch/setup.py +308 -148
  19. gnosisllm_knowledge/cli/app.py +436 -31
  20. gnosisllm_knowledge/cli/commands/agentic.py +26 -9
  21. gnosisllm_knowledge/cli/commands/load.py +169 -19
  22. gnosisllm_knowledge/cli/commands/memory.py +733 -0
  23. gnosisllm_knowledge/cli/commands/search.py +9 -10
  24. gnosisllm_knowledge/cli/commands/setup.py +49 -23
  25. gnosisllm_knowledge/cli/display/service.py +43 -0
  26. gnosisllm_knowledge/cli/utils/config.py +62 -4
  27. gnosisllm_knowledge/core/domain/__init__.py +54 -0
  28. gnosisllm_knowledge/core/domain/discovery.py +166 -0
  29. gnosisllm_knowledge/core/domain/document.py +19 -19
  30. gnosisllm_knowledge/core/domain/memory.py +440 -0
  31. gnosisllm_knowledge/core/domain/result.py +11 -3
  32. gnosisllm_knowledge/core/domain/search.py +12 -25
  33. gnosisllm_knowledge/core/domain/source.py +11 -12
  34. gnosisllm_knowledge/core/events/__init__.py +8 -0
  35. gnosisllm_knowledge/core/events/types.py +198 -5
  36. gnosisllm_knowledge/core/exceptions.py +227 -0
  37. gnosisllm_knowledge/core/interfaces/__init__.py +17 -0
  38. gnosisllm_knowledge/core/interfaces/agentic.py +11 -3
  39. gnosisllm_knowledge/core/interfaces/indexer.py +10 -1
  40. gnosisllm_knowledge/core/interfaces/memory.py +524 -0
  41. gnosisllm_knowledge/core/interfaces/searcher.py +10 -1
  42. gnosisllm_knowledge/core/interfaces/streaming.py +133 -0
  43. gnosisllm_knowledge/core/streaming/__init__.py +36 -0
  44. gnosisllm_knowledge/core/streaming/pipeline.py +228 -0
  45. gnosisllm_knowledge/fetchers/__init__.py +8 -0
  46. gnosisllm_knowledge/fetchers/config.py +27 -0
  47. gnosisllm_knowledge/fetchers/neoreader.py +31 -3
  48. gnosisllm_knowledge/fetchers/neoreader_discovery.py +505 -0
  49. gnosisllm_knowledge/loaders/__init__.py +5 -1
  50. gnosisllm_knowledge/loaders/base.py +3 -4
  51. gnosisllm_knowledge/loaders/discovery.py +338 -0
  52. gnosisllm_knowledge/loaders/discovery_streaming.py +343 -0
  53. gnosisllm_knowledge/loaders/factory.py +46 -0
  54. gnosisllm_knowledge/loaders/sitemap.py +129 -1
  55. gnosisllm_knowledge/loaders/sitemap_streaming.py +258 -0
  56. gnosisllm_knowledge/services/indexing.py +100 -93
  57. gnosisllm_knowledge/services/search.py +84 -31
  58. gnosisllm_knowledge/services/streaming_pipeline.py +334 -0
  59. {gnosisllm_knowledge-0.2.0.dist-info → gnosisllm_knowledge-0.4.0.dist-info}/METADATA +73 -10
  60. gnosisllm_knowledge-0.4.0.dist-info/RECORD +81 -0
  61. gnosisllm_knowledge-0.2.0.dist-info/RECORD +0 -64
  62. {gnosisllm_knowledge-0.2.0.dist-info → gnosisllm_knowledge-0.4.0.dist-info}/WHEEL +0 -0
  63. {gnosisllm_knowledge-0.2.0.dist-info → gnosisllm_knowledge-0.4.0.dist-info}/entry_points.txt +0 -0
@@ -5,6 +5,10 @@ Supports multiple search modes:
5
5
  - keyword: Traditional BM25 text matching
6
6
  - hybrid: Combined semantic + keyword (default, best results)
7
7
  - agentic: AI-powered search with reasoning and answer generation
8
+
9
+ Note:
10
+ This library is tenant-agnostic. Multi-tenancy is achieved through index
11
+ isolation - each tenant should use a separate index (e.g., "knowledge-{account_id}").
8
12
  """
9
13
 
10
14
  from __future__ import annotations
@@ -44,7 +48,6 @@ async def search_command(
44
48
  index_name: str = "knowledge",
45
49
  limit: int = 5,
46
50
  offset: int = 0,
47
- account_id: str | None = None,
48
51
  collection_ids: str | None = None,
49
52
  source_ids: str | None = None,
50
53
  min_score: float = 0.0,
@@ -55,14 +58,17 @@ async def search_command(
55
58
  ) -> None:
56
59
  """Execute the search command.
57
60
 
61
+ Note:
62
+ Multi-tenancy is achieved through index isolation. Use tenant-specific
63
+ index names instead (e.g., --index knowledge-tenant-123).
64
+
58
65
  Args:
59
66
  display: Display service for output.
60
67
  query: Search query text.
61
68
  mode: Search mode (semantic, keyword, hybrid, agentic).
62
- index_name: Index to search.
69
+ index_name: Index to search (use tenant-specific name for isolation).
63
70
  limit: Maximum results to return.
64
71
  offset: Pagination offset.
65
- account_id: Filter by account ID.
66
72
  collection_ids: Filter by collection IDs (comma-separated).
67
73
  source_ids: Filter by source IDs (comma-separated).
68
74
  min_score: Minimum score threshold.
@@ -86,7 +92,6 @@ async def search_command(
86
92
  query=query or "",
87
93
  index_name=index_name,
88
94
  agent_type="flow", # Default to flow for single queries
89
- account_id=account_id,
90
95
  collection_ids=collection_ids,
91
96
  source_ids=source_ids,
92
97
  limit=limit,
@@ -117,7 +122,6 @@ async def search_command(
117
122
  index_name=index_name,
118
123
  mode=mode,
119
124
  limit=limit,
120
- account_id=account_id,
121
125
  collection_ids=collection_ids,
122
126
  source_ids=source_ids,
123
127
  min_score=min_score,
@@ -146,7 +150,6 @@ async def search_command(
146
150
  index_name=index_name,
147
151
  limit=limit,
148
152
  offset=offset,
149
- account_id=account_id,
150
153
  collection_ids=collection_ids,
151
154
  source_ids=source_ids,
152
155
  min_score=min_score,
@@ -167,7 +170,6 @@ async def _execute_search(
167
170
  index_name: str,
168
171
  limit: int,
169
172
  offset: int,
170
- account_id: str | None,
171
173
  collection_ids: str | None,
172
174
  source_ids: str | None,
173
175
  min_score: float,
@@ -214,7 +216,6 @@ async def _execute_search(
214
216
  mode=_get_search_mode(mode),
215
217
  limit=limit,
216
218
  offset=offset,
217
- account_id=account_id,
218
219
  collection_ids=collection_list,
219
220
  source_ids=source_list,
220
221
  min_score=min_score,
@@ -315,7 +316,6 @@ async def _interactive_search(
315
316
  index_name: str,
316
317
  mode: str,
317
318
  limit: int,
318
- account_id: str | None,
319
319
  collection_ids: str | None,
320
320
  source_ids: str | None,
321
321
  min_score: float,
@@ -396,7 +396,6 @@ async def _interactive_search(
396
396
  mode=_get_search_mode(mode),
397
397
  limit=limit,
398
398
  offset=0,
399
- account_id=account_id,
400
399
  collection_ids=collection_list,
401
400
  source_ids=source_list,
402
401
  min_score=min_score,
@@ -26,12 +26,12 @@ if TYPE_CHECKING:
26
26
 
27
27
  async def setup_command(
28
28
  display: RichDisplayService,
29
- host: str = "localhost",
30
- port: int = 9200,
29
+ host: str | None = None,
30
+ port: int | None = None,
31
31
  username: str | None = None,
32
32
  password: str | None = None,
33
- use_ssl: bool = False,
34
- verify_certs: bool = False,
33
+ use_ssl: bool | None = None,
34
+ verify_certs: bool | None = None,
35
35
  force: bool = False,
36
36
  no_sample_data: bool = False,
37
37
  no_hybrid: bool = False,
@@ -40,24 +40,26 @@ async def setup_command(
40
40
 
41
41
  Args:
42
42
  display: Display service for output.
43
- host: OpenSearch host.
44
- port: OpenSearch port.
45
- username: OpenSearch username.
46
- password: OpenSearch password.
47
- use_ssl: Enable SSL.
48
- verify_certs: Verify SSL certificates.
43
+ host: OpenSearch host (overrides env).
44
+ port: OpenSearch port (overrides env).
45
+ username: OpenSearch username (overrides env).
46
+ password: OpenSearch password (overrides env).
47
+ use_ssl: Enable SSL (overrides env).
48
+ verify_certs: Verify SSL certificates (overrides env).
49
49
  force: Clean up existing resources first.
50
50
  no_sample_data: Skip sample data ingestion.
51
51
  no_hybrid: Skip hybrid search pipeline.
52
52
  """
53
- # Load configuration
53
+ # Load configuration from environment
54
54
  cli_config = CliConfig.from_env()
55
55
 
56
- # Override with CLI arguments
57
- final_host = host or cli_config.opensearch_host
58
- final_port = port or cli_config.opensearch_port
59
- final_username = username or cli_config.opensearch_username
60
- final_password = password or cli_config.opensearch_password
56
+ # CLI arguments override environment variables (only if explicitly provided)
57
+ final_host = host if host is not None else cli_config.opensearch_host
58
+ final_port = port if port is not None else cli_config.opensearch_port
59
+ final_username = username if username is not None else cli_config.opensearch_username
60
+ final_password = password if password is not None else cli_config.opensearch_password
61
+ final_use_ssl = use_ssl if use_ssl is not None else cli_config.opensearch_use_ssl
62
+ final_verify_certs = verify_certs if verify_certs is not None else cli_config.opensearch_verify_certs
61
63
 
62
64
  # Validate required config
63
65
  if not cli_config.openai_api_key:
@@ -79,7 +81,7 @@ async def setup_command(
79
81
  "Configuration",
80
82
  [
81
83
  ("Host", f"{final_host}:{final_port}"),
82
- ("SSL", "Enabled" if use_ssl else "Disabled"),
84
+ ("SSL", "Enabled" if final_use_ssl else "Disabled"),
83
85
  ("Auth", "Configured" if final_username else "None"),
84
86
  ("Hybrid Search", "Disabled" if no_hybrid else "Enabled"),
85
87
  ("Force Recreate", "Yes" if force else "No"),
@@ -88,17 +90,41 @@ async def setup_command(
88
90
 
89
91
  display.newline()
90
92
 
91
- # Create OpenSearch config
93
+ # Create OpenSearch config from environment, then override with CLI args
94
+ # This ensures all env vars (including pipeline names) are respected
95
+ base_config = OpenSearchConfig.from_env()
92
96
  opensearch_config = OpenSearchConfig(
97
+ # CLI overrides (if provided)
93
98
  host=final_host,
94
99
  port=final_port,
95
100
  username=final_username,
96
101
  password=final_password,
97
- use_ssl=use_ssl,
98
- verify_certs=verify_certs,
102
+ use_ssl=final_use_ssl,
103
+ verify_certs=final_verify_certs,
99
104
  openai_api_key=cli_config.openai_api_key,
100
105
  embedding_model=cli_config.openai_embedding_model,
101
106
  embedding_dimension=cli_config.openai_embedding_dimension,
107
+ # Preserve env-based config for pipelines and other settings
108
+ ingest_pipeline_name=base_config.ingest_pipeline_name,
109
+ search_pipeline_name=base_config.search_pipeline_name,
110
+ index_prefix=base_config.index_prefix,
111
+ model_id=base_config.model_id,
112
+ model_group_id=base_config.model_group_id,
113
+ embedding_field=base_config.embedding_field,
114
+ # k-NN settings
115
+ knn_engine=base_config.knn_engine,
116
+ knn_space_type=base_config.knn_space_type,
117
+ knn_algo_param_ef_search=base_config.knn_algo_param_ef_search,
118
+ knn_algo_param_ef_construction=base_config.knn_algo_param_ef_construction,
119
+ knn_algo_param_m=base_config.knn_algo_param_m,
120
+ # Index settings
121
+ number_of_shards=base_config.number_of_shards,
122
+ number_of_replicas=base_config.number_of_replicas,
123
+ refresh_interval=base_config.refresh_interval,
124
+ # Agentic settings
125
+ agentic_llm_model=base_config.agentic_llm_model,
126
+ agentic_max_iterations=base_config.agentic_max_iterations,
127
+ agentic_timeout_seconds=base_config.agentic_timeout_seconds,
102
128
  )
103
129
 
104
130
  # Create OpenSearch client
@@ -109,8 +135,8 @@ async def setup_command(
109
135
  client = AsyncOpenSearch(
110
136
  hosts=[{"host": final_host, "port": final_port}],
111
137
  http_auth=http_auth,
112
- use_ssl=use_ssl,
113
- verify_certs=verify_certs,
138
+ use_ssl=final_use_ssl,
139
+ verify_certs=final_verify_certs,
114
140
  ssl_show_warn=False,
115
141
  )
116
142
 
@@ -124,7 +150,7 @@ async def setup_command(
124
150
  display.format_error_with_suggestion(
125
151
  error=f"Cannot connect to OpenSearch at {final_host}:{final_port}",
126
152
  suggestion="Ensure OpenSearch is running and accessible.",
127
- command=f"curl http{'s' if use_ssl else ''}://{final_host}:{final_port}",
153
+ command=f"curl http{'s' if final_use_ssl else ''}://{final_host}:{final_port}",
128
154
  )
129
155
  sys.exit(1)
130
156
 
@@ -553,3 +553,46 @@ class RichDisplayService:
553
553
  suggestion="Run agentic setup to create agents.",
554
554
  command="gnosisllm-knowledge agentic setup",
555
555
  )
556
+
557
+ def memory_status(
558
+ self,
559
+ llm_model_id: str | None,
560
+ embedding_model_id: str | None,
561
+ llm_model: str = "gpt-4o",
562
+ embedding_model: str = "text-embedding-3-small",
563
+ ) -> None:
564
+ """Display agentic memory configuration status.
565
+
566
+ Args:
567
+ llm_model_id: LLM model ID if configured.
568
+ embedding_model_id: Embedding model ID if configured.
569
+ llm_model: LLM model name for fact extraction.
570
+ embedding_model: Embedding model name.
571
+ """
572
+ status_rows = []
573
+
574
+ # LLM Model
575
+ if llm_model_id:
576
+ status_rows.append(("LLM Model", "[green]Configured[/green]"))
577
+ status_rows.append((" ID", f"[dim]{llm_model_id}[/dim]"))
578
+ status_rows.append((" Model", llm_model))
579
+ else:
580
+ status_rows.append(("LLM Model", "[red]Not configured[/red]"))
581
+
582
+ # Embedding Model
583
+ if embedding_model_id:
584
+ status_rows.append(("Embedding Model", "[green]Configured[/green]"))
585
+ status_rows.append((" ID", f"[dim]{embedding_model_id}[/dim]"))
586
+ status_rows.append((" Model", embedding_model))
587
+ else:
588
+ status_rows.append(("Embedding Model", "[red]Not configured[/red]"))
589
+
590
+ self.table("Agentic Memory Configuration", status_rows)
591
+
592
+ if not llm_model_id or not embedding_model_id:
593
+ self.newline()
594
+ self.format_error_with_suggestion(
595
+ error="Memory models not configured.",
596
+ suggestion="Run memory setup to create connectors and models.",
597
+ command="gnosisllm-knowledge memory setup --openai-key sk-...",
598
+ )
@@ -27,7 +27,7 @@ class CliConfig:
27
27
  opensearch_verify_certs: bool = False
28
28
  opensearch_model_id: str | None = None
29
29
  opensearch_index_name: str = "knowledge"
30
- opensearch_pipeline_name: str = "gnosisllm-ingest-pipeline"
30
+ opensearch_ingest_pipeline_name: str = "gnosisllm-ingest-pipeline"
31
31
  opensearch_search_pipeline_name: str = "gnosisllm-search-pipeline"
32
32
 
33
33
  # OpenAI
@@ -42,6 +42,13 @@ class CliConfig:
42
42
  agentic_max_iterations: int = 5
43
43
  agentic_timeout_seconds: int = 60
44
44
 
45
+ # Agentic Memory
46
+ memory_llm_model_id: str | None = None
47
+ memory_embedding_model_id: str | None = None
48
+ memory_llm_model: str = "gpt-4o"
49
+ memory_embedding_model: str = "text-embedding-3-small"
50
+ memory_embedding_dimension: int = 1536
51
+
45
52
  # Neoreader
46
53
  neoreader_host: str = "https://api.neoreader.dev"
47
54
 
@@ -71,11 +78,11 @@ class CliConfig:
71
78
  == "true",
72
79
  opensearch_model_id=os.getenv("OPENSEARCH_MODEL_ID"),
73
80
  opensearch_index_name=os.getenv("OPENSEARCH_INDEX_NAME", "knowledge"),
74
- opensearch_pipeline_name=os.getenv(
75
- "OPENSEARCH_PIPELINE_NAME", "gnosisllm-ingest-pipeline"
81
+ opensearch_ingest_pipeline_name=os.getenv(
82
+ "OPENSEARCH_INGEST_PIPELINE", "gnosisllm-ingest-pipeline"
76
83
  ),
77
84
  opensearch_search_pipeline_name=os.getenv(
78
- "OPENSEARCH_SEARCH_PIPELINE_NAME", "gnosisllm-search-pipeline"
85
+ "OPENSEARCH_SEARCH_PIPELINE", "gnosisllm-search-pipeline"
79
86
  ),
80
87
  openai_api_key=os.getenv("OPENAI_API_KEY"),
81
88
  openai_embedding_model=os.getenv("OPENAI_EMBEDDING_MODEL", "text-embedding-ada-002"),
@@ -86,6 +93,12 @@ class CliConfig:
86
93
  agentic_llm_model=os.getenv("AGENTIC_LLM_MODEL", "gpt-4o"),
87
94
  agentic_max_iterations=int(os.getenv("AGENTIC_MAX_ITERATIONS", "5")),
88
95
  agentic_timeout_seconds=int(os.getenv("AGENTIC_TIMEOUT_SECONDS", "60")),
96
+ # Agentic Memory configuration
97
+ memory_llm_model_id=os.getenv("OPENSEARCH_MEMORY_LLM_MODEL_ID"),
98
+ memory_embedding_model_id=os.getenv("OPENSEARCH_MEMORY_EMBEDDING_MODEL_ID"),
99
+ memory_llm_model=os.getenv("MEMORY_LLM_MODEL", "gpt-4o"),
100
+ memory_embedding_model=os.getenv("MEMORY_EMBEDDING_MODEL", "text-embedding-3-small"),
101
+ memory_embedding_dimension=int(os.getenv("MEMORY_EMBEDDING_DIMENSION", "1536")),
89
102
  neoreader_host=os.getenv("NEOREADER_HOST", "https://api.neoreader.dev"),
90
103
  )
91
104
 
@@ -205,3 +218,48 @@ class CliConfig:
205
218
  def has_conversational_agent(self) -> bool:
206
219
  """Check if conversational agent is configured."""
207
220
  return bool(self.opensearch_conversational_agent_id)
221
+
222
+ # === Memory Configuration ===
223
+
224
+ def validate_for_memory(self) -> list[str]:
225
+ """Validate configuration for memory commands.
226
+
227
+ Returns:
228
+ List of validation errors (empty if valid).
229
+ """
230
+ errors = []
231
+ if not self.memory_llm_model_id:
232
+ errors.append(
233
+ "OPENSEARCH_MEMORY_LLM_MODEL_ID is required for memory operations. "
234
+ "Run 'gnosisllm-knowledge memory setup' first."
235
+ )
236
+ if not self.memory_embedding_model_id:
237
+ errors.append(
238
+ "OPENSEARCH_MEMORY_EMBEDDING_MODEL_ID is required for memory operations. "
239
+ "Run 'gnosisllm-knowledge memory setup' first."
240
+ )
241
+ return errors
242
+
243
+ def validate_for_memory_setup(self) -> list[str]:
244
+ """Validate configuration for memory setup command.
245
+
246
+ Returns:
247
+ List of validation errors (empty if valid).
248
+ """
249
+ errors = []
250
+ if not self.openai_api_key:
251
+ errors.append(
252
+ "OPENAI_API_KEY is required for memory setup. "
253
+ "Use --openai-key or set the environment variable."
254
+ )
255
+ return errors
256
+
257
+ @property
258
+ def has_memory_models(self) -> bool:
259
+ """Check if memory models are configured."""
260
+ return bool(self.memory_llm_model_id and self.memory_embedding_model_id)
261
+
262
+ @property
263
+ def memory_is_configured(self) -> bool:
264
+ """Check if memory is fully configured for operations."""
265
+ return self.has_memory_models
@@ -1,6 +1,34 @@
1
1
  """Domain models - Value objects and entities."""
2
2
 
3
+ from gnosisllm_knowledge.core.domain.discovery import (
4
+ DiscoveredURL,
5
+ DiscoveryConfig,
6
+ DiscoveryJobStatus,
7
+ DiscoveryProgress,
8
+ DiscoveryStats,
9
+ )
3
10
  from gnosisllm_knowledge.core.domain.document import Document, DocumentStatus, TextChunk
11
+ from gnosisllm_knowledge.core.domain.memory import (
12
+ ContainerConfig,
13
+ ContainerIndexSettings,
14
+ ContainerInfo,
15
+ EmbeddingModelType,
16
+ HistoryAction,
17
+ HistoryEntry,
18
+ IndexSettings,
19
+ MemoryEntry,
20
+ MemoryStats,
21
+ MemoryStrategy,
22
+ MemoryType,
23
+ Message,
24
+ Namespace,
25
+ PayloadType,
26
+ RecallResult,
27
+ SessionInfo,
28
+ StoreRequest,
29
+ StoreResult,
30
+ StrategyConfig,
31
+ )
4
32
  from gnosisllm_knowledge.core.domain.result import (
5
33
  BatchResult,
6
34
  IndexResult,
@@ -20,10 +48,36 @@ from gnosisllm_knowledge.core.domain.search import (
20
48
  from gnosisllm_knowledge.core.domain.source import SourceConfig
21
49
 
22
50
  __all__ = [
51
+ # Discovery
52
+ "DiscoveredURL",
53
+ "DiscoveryConfig",
54
+ "DiscoveryJobStatus",
55
+ "DiscoveryProgress",
56
+ "DiscoveryStats",
23
57
  # Document
24
58
  "Document",
25
59
  "DocumentStatus",
26
60
  "TextChunk",
61
+ # Memory
62
+ "MemoryStrategy",
63
+ "MemoryType",
64
+ "PayloadType",
65
+ "EmbeddingModelType",
66
+ "HistoryAction",
67
+ "StrategyConfig",
68
+ "IndexSettings",
69
+ "ContainerIndexSettings",
70
+ "ContainerConfig",
71
+ "ContainerInfo",
72
+ "Message",
73
+ "Namespace",
74
+ "StoreRequest",
75
+ "StoreResult",
76
+ "MemoryEntry",
77
+ "RecallResult",
78
+ "SessionInfo",
79
+ "HistoryEntry",
80
+ "MemoryStats",
27
81
  # Result
28
82
  "LoadResult",
29
83
  "IndexResult",
@@ -0,0 +1,166 @@
1
+ """Domain models for website discovery."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass, field
6
+
7
+
8
+ @dataclass
9
+ class DiscoveryConfig:
10
+ """Configuration for website discovery crawl.
11
+
12
+ Controls how the Neo Reader Discovery API crawls and discovers URLs.
13
+
14
+ Attributes:
15
+ max_depth: Maximum crawl depth from start URL.
16
+ max_pages: Maximum number of pages to crawl.
17
+ same_domain: Only crawl URLs on the same domain.
18
+ include_subdomains: Include subdomains when same_domain is True.
19
+ respect_robots: Respect robots.txt rules.
20
+ parse_sitemap: Also parse sitemap if available.
21
+ with_metadata: Include page metadata (title, etc.) in results.
22
+ crawl_timeout: Overall timeout for the crawl in seconds.
23
+ concurrent_requests: Number of concurrent crawl requests.
24
+ request_delay: Delay between requests in milliseconds.
25
+ include_pattern: Regex pattern for URLs to include.
26
+ exclude_pattern: Regex pattern for URLs to exclude.
27
+ path_prefix: Only crawl URLs with this path prefix.
28
+ """
29
+
30
+ max_depth: int = 3
31
+ max_pages: int = 100
32
+ same_domain: bool = True
33
+ include_subdomains: bool = True
34
+ respect_robots: bool = True
35
+ parse_sitemap: bool = False
36
+ with_metadata: bool = True
37
+ crawl_timeout: int = 300
38
+ concurrent_requests: int = 5
39
+ request_delay: int = 100
40
+ include_pattern: str | None = None
41
+ exclude_pattern: str | None = None
42
+ path_prefix: str | None = None
43
+
44
+ def to_headers(self) -> dict[str, str]:
45
+ """Convert config to HTTP headers for Neo Reader API.
46
+
47
+ Returns:
48
+ Dictionary of header name to value.
49
+ """
50
+ headers = {
51
+ "X-Max-Depth": str(self.max_depth),
52
+ "X-Max-Pages": str(self.max_pages),
53
+ "X-Same-Domain": str(self.same_domain).lower(),
54
+ "X-Include-Subdomains": str(self.include_subdomains).lower(),
55
+ "X-Respect-Robots": str(self.respect_robots).lower(),
56
+ "X-Parse-Sitemap": str(self.parse_sitemap).lower(),
57
+ "X-With-Metadata": str(self.with_metadata).lower(),
58
+ "X-Crawl-Timeout": str(self.crawl_timeout),
59
+ "X-Concurrent-Requests": str(self.concurrent_requests),
60
+ "X-Request-Delay": str(self.request_delay),
61
+ }
62
+ if self.include_pattern:
63
+ headers["X-Include-Pattern"] = self.include_pattern
64
+ if self.exclude_pattern:
65
+ headers["X-Exclude-Pattern"] = self.exclude_pattern
66
+ if self.path_prefix:
67
+ headers["X-Path-Prefix"] = self.path_prefix
68
+ return headers
69
+
70
+
71
+ @dataclass
72
+ class DiscoveryProgress:
73
+ """Progress information for a running discovery job.
74
+
75
+ Attributes:
76
+ percent: Completion percentage (0-100).
77
+ pages_crawled: Number of pages crawled so far.
78
+ urls_discovered: Number of URLs discovered so far.
79
+ current_depth: Current crawl depth.
80
+ message: Human-readable progress message.
81
+ """
82
+
83
+ percent: int = 0
84
+ pages_crawled: int = 0
85
+ urls_discovered: int = 0
86
+ current_depth: int = 0
87
+ message: str = ""
88
+
89
+
90
+ @dataclass
91
+ class DiscoveryStats:
92
+ """Statistics for a completed discovery job.
93
+
94
+ Attributes:
95
+ pages_crawled: Total pages crawled.
96
+ urls_found: Total URLs found during crawl.
97
+ urls_returned: URLs returned in results (after filtering).
98
+ urls_filtered: URLs excluded by filters.
99
+ errors: Number of errors during crawl.
100
+ duration_seconds: Total crawl duration.
101
+ """
102
+
103
+ pages_crawled: int = 0
104
+ urls_found: int = 0
105
+ urls_returned: int = 0
106
+ urls_filtered: int = 0
107
+ errors: int = 0
108
+ duration_seconds: float = 0.0
109
+
110
+
111
+ @dataclass
112
+ class DiscoveredURL:
113
+ """A URL discovered during crawl.
114
+
115
+ Attributes:
116
+ url: The discovered URL.
117
+ depth: Crawl depth at which URL was found.
118
+ title: Page title if available.
119
+ is_internal: Whether URL is internal to the domain.
120
+ """
121
+
122
+ url: str
123
+ depth: int = 0
124
+ title: str | None = None
125
+ is_internal: bool = True
126
+
127
+
128
+ @dataclass
129
+ class DiscoveryJobStatus:
130
+ """Status of a discovery job.
131
+
132
+ Represents the current state of an async discovery job.
133
+
134
+ Attributes:
135
+ job_id: Unique job identifier.
136
+ status: Job status (pending, queued, running, completed, failed, cancelled).
137
+ start_url: The URL that started the discovery.
138
+ progress: Progress information if job is running.
139
+ stats: Statistics if job is completed.
140
+ urls: Discovered URLs if job is completed.
141
+ error: Error message if job failed.
142
+ """
143
+
144
+ job_id: str
145
+ status: str
146
+ start_url: str
147
+ progress: DiscoveryProgress | None = None
148
+ stats: DiscoveryStats | None = None
149
+ urls: list[DiscoveredURL] = field(default_factory=list)
150
+ error: str | None = None
151
+
152
+ def is_terminal(self) -> bool:
153
+ """Check if job is in a terminal state.
154
+
155
+ Returns:
156
+ True if job is completed, failed, or cancelled.
157
+ """
158
+ return self.status in ("completed", "failed", "cancelled")
159
+
160
+ def is_running(self) -> bool:
161
+ """Check if job is currently running.
162
+
163
+ Returns:
164
+ True if job is pending, queued, or running.
165
+ """
166
+ return self.status in ("pending", "queued", "running")