gnosisllm-knowledge 0.3.0__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff shows the content of publicly available package versions as published to their respective public registries. It is provided for informational purposes only.
Files changed (43)
  1. gnosisllm_knowledge/api/knowledge.py +225 -35
  2. gnosisllm_knowledge/backends/memory/indexer.py +27 -2
  3. gnosisllm_knowledge/backends/memory/searcher.py +111 -10
  4. gnosisllm_knowledge/backends/opensearch/agentic.py +14 -9
  5. gnosisllm_knowledge/backends/opensearch/indexer.py +48 -3
  6. gnosisllm_knowledge/backends/opensearch/mappings.py +12 -4
  7. gnosisllm_knowledge/backends/opensearch/queries.py +33 -33
  8. gnosisllm_knowledge/backends/opensearch/searcher.py +9 -6
  9. gnosisllm_knowledge/cli/app.py +58 -19
  10. gnosisllm_knowledge/cli/commands/agentic.py +15 -9
  11. gnosisllm_knowledge/cli/commands/load.py +169 -19
  12. gnosisllm_knowledge/cli/commands/memory.py +10 -0
  13. gnosisllm_knowledge/cli/commands/search.py +9 -10
  14. gnosisllm_knowledge/cli/commands/setup.py +25 -1
  15. gnosisllm_knowledge/cli/utils/config.py +4 -4
  16. gnosisllm_knowledge/core/domain/__init__.py +13 -0
  17. gnosisllm_knowledge/core/domain/discovery.py +166 -0
  18. gnosisllm_knowledge/core/domain/document.py +14 -19
  19. gnosisllm_knowledge/core/domain/search.py +10 -25
  20. gnosisllm_knowledge/core/domain/source.py +11 -12
  21. gnosisllm_knowledge/core/events/__init__.py +8 -0
  22. gnosisllm_knowledge/core/events/types.py +122 -5
  23. gnosisllm_knowledge/core/exceptions.py +93 -0
  24. gnosisllm_knowledge/core/interfaces/agentic.py +11 -3
  25. gnosisllm_knowledge/core/interfaces/indexer.py +10 -1
  26. gnosisllm_knowledge/core/interfaces/searcher.py +10 -1
  27. gnosisllm_knowledge/core/interfaces/streaming.py +10 -4
  28. gnosisllm_knowledge/fetchers/__init__.py +8 -0
  29. gnosisllm_knowledge/fetchers/config.py +27 -0
  30. gnosisllm_knowledge/fetchers/neoreader.py +31 -3
  31. gnosisllm_knowledge/fetchers/neoreader_discovery.py +505 -0
  32. gnosisllm_knowledge/loaders/__init__.py +5 -1
  33. gnosisllm_knowledge/loaders/discovery.py +338 -0
  34. gnosisllm_knowledge/loaders/discovery_streaming.py +343 -0
  35. gnosisllm_knowledge/loaders/factory.py +46 -0
  36. gnosisllm_knowledge/services/indexing.py +35 -20
  37. gnosisllm_knowledge/services/search.py +37 -20
  38. gnosisllm_knowledge/services/streaming_pipeline.py +39 -7
  39. {gnosisllm_knowledge-0.3.0.dist-info → gnosisllm_knowledge-0.4.0.dist-info}/METADATA +30 -10
  40. gnosisllm_knowledge-0.4.0.dist-info/RECORD +81 -0
  41. gnosisllm_knowledge-0.3.0.dist-info/RECORD +0 -77
  42. {gnosisllm_knowledge-0.3.0.dist-info → gnosisllm_knowledge-0.4.0.dist-info}/WHEEL +0 -0
  43. {gnosisllm_knowledge-0.3.0.dist-info → gnosisllm_knowledge-0.4.0.dist-info}/entry_points.txt +0 -0
gnosisllm_knowledge/core/interfaces/agentic.py

@@ -1,4 +1,11 @@
-"""Agentic searcher protocol - Interface for AI-powered search operations."""
+"""Agentic searcher protocol - Interface for AI-powered search operations.
+
+Note:
+    This library is tenant-agnostic. Multi-tenancy is achieved through index
+    isolation (e.g., `knowledge-{account_id}`). Agentic searcher implementations
+    should not include tenant filtering logic - callers should use tenant-specific
+    indices.
+"""
 
 from __future__ import annotations
 
@@ -15,6 +22,9 @@ if TYPE_CHECKING:
 class IAgenticSearcher(Protocol):
     """Protocol for agentic search operations using AI agents.
 
+    This protocol is tenant-agnostic. Multi-tenancy is achieved through index
+    isolation by using tenant-specific index names.
+
     Agentic searchers are responsible for:
     - Understanding natural language queries
     - Automatically constructing optimal search strategies
@@ -107,13 +117,11 @@ class IAgenticSearcher(Protocol):
 
     async def list_conversations(
         self,
-        account_id: str | None = None,
         limit: int = 100,
     ) -> list[dict[str, Any]]:
         """List active conversations.
 
         Args:
-            account_id: Filter by account (multi-tenant).
             limit: Maximum number of conversations.
 
         Returns:
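The same pattern repeats across the protocol modules below: `account_id` parameters disappear, and tenant scoping moves entirely into the index name. A minimal caller-side sketch of that convention; the helper name and the `searcher` object are illustrative, since the protocol does not prescribe a concrete implementation:

```python
# Sketch of the 0.4.0 convention: scope by index name, not by account_id.
# `searcher` stands for any IAgenticSearcher implementation; how it is
# constructed and bound to an index is an assumption, not shown in this diff.

def tenant_index(account_id: str) -> str:
    """Build the tenant-specific index name, e.g. 'knowledge-acme'."""
    return f"knowledge-{account_id}"

async def list_tenant_conversations(searcher) -> list[dict]:
    # list_conversations() no longer accepts account_id; the searcher is
    # already tenant-scoped because it targets a tenant-specific index.
    return await searcher.list_conversations(limit=100)
```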
gnosisllm_knowledge/core/interfaces/indexer.py

@@ -1,4 +1,10 @@
-"""Document indexer protocol - Interface Segregation Principle."""
+"""Document indexer protocol - Interface Segregation Principle.
+
+Note:
+    This library is tenant-agnostic. Multi-tenancy is achieved through index
+    isolation (e.g., `knowledge-{account_id}`). Indexer implementations should
+    not include tenant filtering logic - callers should use tenant-specific indices.
+"""
 
 from __future__ import annotations
 
@@ -14,6 +20,9 @@ if TYPE_CHECKING:
 class IDocumentIndexer(Protocol):
     """Protocol for indexing documents into a search backend.
 
+    This protocol is tenant-agnostic. Multi-tenancy is achieved through index
+    isolation by using tenant-specific index names.
+
     Document indexers are responsible for:
     - Generating embeddings for documents
     - Storing documents in the search backend
gnosisllm_knowledge/core/interfaces/searcher.py

@@ -1,4 +1,10 @@
-"""Knowledge searcher protocol - Interface Segregation Principle."""
+"""Knowledge searcher protocol - Interface Segregation Principle.
+
+Note:
+    This library is tenant-agnostic. Multi-tenancy is achieved through index
+    isolation (e.g., `knowledge-{account_id}`). Searcher implementations should
+    not include tenant filtering logic - callers should use tenant-specific indices.
+"""
 
 from __future__ import annotations
 
@@ -12,6 +18,9 @@ if TYPE_CHECKING:
 class IKnowledgeSearcher(Protocol):
     """Protocol for searching documents in a search backend.
 
+    This protocol is tenant-agnostic. Multi-tenancy is achieved through index
+    isolation by using tenant-specific index names.
+
     Knowledge searchers are responsible for:
     - Executing different search modes (semantic, keyword, hybrid)
     - Generating embeddings for queries
gnosisllm_knowledge/core/interfaces/streaming.py

@@ -2,6 +2,11 @@
 
 These protocols define contracts for streaming operations that process
 data in bounded batches rather than loading everything into memory.
+
+Note:
+    This library is tenant-agnostic. Multi-tenancy is achieved through index
+    isolation (e.g., `knowledge-{account_id}`). Streaming implementations should
+    not include tenant filtering logic - callers should use tenant-specific indices.
 """
 
 from __future__ import annotations
@@ -97,6 +102,9 @@ class IStreamingLoader(Protocol):
 class IStreamingPipeline(Protocol):
     """Protocol for streaming indexing pipelines.
 
+    This protocol is tenant-agnostic. Multi-tenancy is achieved through index
+    isolation by using tenant-specific index names.
+
     Orchestrates the full streaming load -> index pipeline with
     bounded memory guarantees.
     """
@@ -106,7 +114,6 @@ class IStreamingPipeline(Protocol):
         source: str,
         index_name: str,
         *,
-        account_id: str | None = None,
         collection_id: str | None = None,
         source_id: str | None = None,
         **options: Any,
@@ -115,9 +122,8 @@
 
         Args:
             source: Sitemap URL.
-            index_name: Target OpenSearch index.
-            account_id: For multi-tenancy filtering.
-            collection_id: Collection within account.
+            index_name: Target OpenSearch index (use tenant-specific name).
+            collection_id: Collection within the index.
             source_id: Source identifier.
             **options: Additional loader options.
 
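Caller-side, the signature change looks roughly like this. A sketch only: the method name `run` is an assumption (the hunk above shows only the parameter list), and `pipeline` is any IStreamingPipeline implementation:

```python
# Hypothetical invocation after the 0.4.0 change: account_id is gone, and
# tenancy is expressed through index_name alone.

async def index_sitemap(pipeline, account_id: str, sitemap_url: str):
    return await pipeline.run(  # method name assumed; not shown in this hunk
        source=sitemap_url,
        index_name=f"knowledge-{account_id}",  # tenant isolation via the index
        collection_id="docs",                  # collection within the index
        source_id="sitemap-main",
        max_pages=100,                         # forwarded through **options
    )
```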
gnosisllm_knowledge/fetchers/__init__.py

@@ -1,12 +1,20 @@
 """Content fetchers for retrieving content from URLs."""
 
+from gnosisllm_knowledge.core.exceptions import (
+    DiscoveryJobFailedError,
+    DiscoveryTimeoutError,
+)
 from gnosisllm_knowledge.fetchers.config import FetcherConfig, NeoreaderConfig
 from gnosisllm_knowledge.fetchers.http import HTTPContentFetcher
 from gnosisllm_knowledge.fetchers.neoreader import NeoreaderContentFetcher
+from gnosisllm_knowledge.fetchers.neoreader_discovery import NeoreaderDiscoveryClient
 
 __all__ = [
     "HTTPContentFetcher",
     "NeoreaderContentFetcher",
+    "NeoreaderDiscoveryClient",
     "FetcherConfig",
     "NeoreaderConfig",
+    "DiscoveryTimeoutError",
+    "DiscoveryJobFailedError",
 ]
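All of the new names are re-exported at package level, so discovery failures can be caught without importing from `core.exceptions`. A sketch of the intended error handling; the discovery client's constructor and job API are assumptions, since neither appears in this hunk:

```python
from gnosisllm_knowledge.fetchers import (
    DiscoveryJobFailedError,
    DiscoveryTimeoutError,
    NeoreaderConfig,
    NeoreaderDiscoveryClient,
)

config = NeoreaderConfig.from_env()
client = NeoreaderDiscoveryClient(config)  # constructor signature assumed

async def discover(url: str):
    try:
        ...  # submit the discovery job and poll for results (API not shown here)
    except DiscoveryTimeoutError:
        ...  # discovery_timeout elapsed before the crawl finished
    except DiscoveryJobFailedError:
        ...  # Neoreader reported the discovery job as failed
```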
gnosisllm_knowledge/fetchers/config.py

@@ -40,6 +40,11 @@ class NeoreaderConfig:
         remove_selector: CSS selector for elements to remove.
         with_images: Whether to include image references.
         with_links: Whether to include link references.
+        discovery_enabled: Whether discovery loader is enabled.
+        discovery_poll_interval: Interval between status polls in seconds.
+        discovery_timeout: Maximum time to wait for discovery completion in seconds.
+        discovery_max_depth: Default maximum crawl depth for discovery.
+        discovery_max_pages: Default maximum pages to discover.
     """
 
     host: str = "http://localhost:3000"
@@ -50,6 +55,13 @@ class NeoreaderConfig:
     with_images: bool = False
     with_links: bool = True
 
+    # Discovery settings
+    discovery_enabled: bool = True
+    discovery_poll_interval: float = 2.0
+    discovery_timeout: float = 600.0
+    discovery_max_depth: int = 3
+    discovery_max_pages: int = 100
+
     @classmethod
     def from_env(cls) -> NeoreaderConfig:
         """Create configuration from environment variables.
@@ -62,6 +74,11 @@ class NeoreaderConfig:
         - NEOREADER_REMOVE_SELECTOR: CSS selector for removal
         - NEOREADER_WITH_IMAGES: Include images (true/false)
         - NEOREADER_WITH_LINKS: Include links (true/false)
+        - NEOREADER_DISCOVERY_ENABLED: Enable discovery loader (true/false)
+        - NEOREADER_DISCOVERY_POLL_INTERVAL: Discovery poll interval in seconds
+        - NEOREADER_DISCOVERY_TIMEOUT: Discovery timeout in seconds
+        - NEOREADER_DISCOVERY_MAX_DEPTH: Default max crawl depth
+        - NEOREADER_DISCOVERY_MAX_PAGES: Default max pages to discover
 
         Returns:
             NeoreaderConfig populated from environment.
@@ -74,4 +91,14 @@ class NeoreaderConfig:
             remove_selector=os.getenv("NEOREADER_REMOVE_SELECTOR"),
             with_images=os.getenv("NEOREADER_WITH_IMAGES", "").lower() == "true",
             with_links=os.getenv("NEOREADER_WITH_LINKS", "true").lower() == "true",
+            discovery_enabled=os.getenv("NEOREADER_DISCOVERY_ENABLED", "true").lower()
+            == "true",
+            discovery_poll_interval=float(
+                os.getenv("NEOREADER_DISCOVERY_POLL_INTERVAL", "2.0")
+            ),
+            discovery_timeout=float(
+                os.getenv("NEOREADER_DISCOVERY_TIMEOUT", "600.0")
+            ),
+            discovery_max_depth=int(os.getenv("NEOREADER_DISCOVERY_MAX_DEPTH", "3")),
+            discovery_max_pages=int(os.getenv("NEOREADER_DISCOVERY_MAX_PAGES", "100")),
         )
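A small usage sketch for the new settings, using only the environment variables and defaults documented in the hunks above:

```python
import os

# Discovery tuning comes straight from the environment; unset variables
# fall back to the defaults shown above (2.0s poll, 600s timeout, etc.).
os.environ["NEOREADER_DISCOVERY_ENABLED"] = "true"
os.environ["NEOREADER_DISCOVERY_POLL_INTERVAL"] = "1.5"
os.environ["NEOREADER_DISCOVERY_TIMEOUT"] = "300"
os.environ["NEOREADER_DISCOVERY_MAX_DEPTH"] = "2"
os.environ["NEOREADER_DISCOVERY_MAX_PAGES"] = "50"

from gnosisllm_knowledge.fetchers import NeoreaderConfig

config = NeoreaderConfig.from_env()
assert config.discovery_poll_interval == 1.5
assert config.discovery_timeout == 300.0
assert config.discovery_max_depth == 2
```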
gnosisllm_knowledge/fetchers/neoreader.py

@@ -43,6 +43,15 @@ class NeoreaderContentFetcher:
         self._config = config or NeoreaderConfig.from_env()
         self._logger = logging.getLogger(__name__)
 
+    @property
+    def config(self) -> NeoreaderConfig:
+        """Expose configuration for reuse by discovery client.
+
+        Returns:
+            The Neo Reader configuration used by this fetcher.
+        """
+        return self._config
+
     async def fetch(self, url: str, **options: Any) -> FetchResult:
         """Fetch content from a URL using Neoreader.
 
@@ -181,7 +190,7 @@
     def _extract_title(self, content: str) -> str | None:
         """Extract title from markdown content.
 
-        Looks for the first H1 heading in the markdown.
+        Looks for the first H1 heading in various formats.
 
         Args:
             content: Markdown content.
@@ -189,14 +198,33 @@
         Returns:
             Title string or None.
         """
-        # Look for first H1 heading
         lines = content.split("\n")
+
+        # Look for ATX-style H1 heading (# Title)
         for line in lines:
             line = line.strip()
             if line.startswith("# "):
                 return line[2:].strip()
 
-        # Try regex for H1
+        # Look for "Title: ..." prefix format (common in Neoreader output)
+        for line in lines:
+            line = line.strip()
+            if line.startswith("Title:"):
+                title = line[6:].strip()
+                # Stop at "URL" or "Source" if present on same line
+                for stop in [" URL", " Source"]:
+                    if stop in title:
+                        title = title[:title.index(stop)]
+                return title.strip() if title else None
+
+        # Look for Setext-style H1 (Title followed by === line)
+        for i, line in enumerate(lines[:-1]):
+            line = line.strip()
+            next_line = lines[i + 1].strip() if i + 1 < len(lines) else ""
+            if line and next_line and all(c == "=" for c in next_line) and len(next_line) >= 3:
+                return line
+
+        # Try regex for ATX H1
         match = re.search(r"^#\s+(.+)$", content, re.MULTILINE)
         if match:
            return match.group(1).strip()
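For reference, the three heading styles the broadened extractor recognizes, exercised against the private helper. Demonstration only: calling `_extract_title` directly is not public API, and the no-argument constructor is assumed to work because it falls back to `NeoreaderConfig.from_env()` per the first hunk for this file:

```python
from gnosisllm_knowledge.fetchers import NeoreaderContentFetcher

fetcher = NeoreaderContentFetcher()  # config falls back to NeoreaderConfig.from_env()

atx = "# Getting Started\n\nBody text."
prefixed = "Title: Getting Started URL Source: https://example.com\n\nBody."
setext = "Getting Started\n===============\n\nBody text."

# All three variants resolve to the same title under the logic above;
# the "Title:" form is truncated at the trailing " URL"/" Source" markers.
for sample in (atx, prefixed, setext):
    assert fetcher._extract_title(sample) == "Getting Started"
```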