chunksilo 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of chunksilo might be problematic. Click here for more details.

chunksilo/__init__.py ADDED
@@ -0,0 +1,4 @@
1
+ # SPDX-License-Identifier: Apache-2.0
2
"""ChunkSilo - Local RAG-based semantic document search."""

# Version string of the installed package.
__version__ = "2.0.0"
chunksilo/__main__.py ADDED
@@ -0,0 +1,3 @@
1
from chunksilo.cli import main

# `python -m chunksilo` executes this file with __name__ == "__main__".
# Guarding the call keeps a plain import of this module (for example when
# multiprocessing's spawn start method re-imports the main module in a child
# process) from running the CLI a second time.
if __name__ == "__main__":
    main()
chunksilo/cfgload.py ADDED
@@ -0,0 +1,163 @@
1
+ #!/usr/bin/env python3
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ """
4
+ Shared configuration loading for ChunkSilo.
5
+
6
+ Loads configuration from config.yaml, searching in standard locations.
7
+ """
8
+ import logging
9
+ import os
10
+ import yaml
11
+ from pathlib import Path
12
+ from typing import Any
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+
17
def _find_config() -> Path:
    """Find config.yaml using a priority-based search.

    Search order:
    1. CHUNKSILO_CONFIG environment variable
    2. ./config.yaml (current working directory)
    3. ~/.config/chunksilo/config.yaml (XDG standard)

    Returns:
        The path named by CHUNKSILO_CONFIG (leading ``~`` expanded) when the
        variable is set and non-empty, whether or not that file exists.
        Otherwise the first existing candidate from the list above. When no
        candidate exists, ``./config.yaml`` is returned anyway so the caller
        can detect the missing file and fall back to built-in defaults.
    """
    env_path = os.environ.get("CHUNKSILO_CONFIG")
    if env_path:
        # The variable may be set by a launcher that performs no shell
        # expansion, so resolve a leading "~" here instead of treating it as
        # a literal directory name.
        return Path(env_path).expanduser()

    cwd_path = Path.cwd() / "config.yaml"
    if cwd_path.exists():
        return cwd_path

    # Only touch the home directory when the working directory had no config.
    xdg_path = Path.home() / ".config" / "chunksilo" / "config.yaml"
    if xdg_path.exists():
        return xdg_path

    # Nothing found: hand back the cwd candidate as the nominal location.
    return cwd_path
39
+
40
+
41
# Resolved once, when this module is first imported.
CONFIG_PATH = _find_config()

# Built-in values; load_config() overlays the user's YAML on top of these.
_DEFAULTS: dict[str, Any] = {
    "indexing": {
        "directories": ["./data"],
        "defaults": {
            "include": [
                "**/*.pdf",
                "**/*.md",
                "**/*.txt",
                "**/*.docx",
                "**/*.doc",
            ],
            "exclude": [],
            "recursive": True,
        },
        "chunk_size": 1600,
        "chunk_overlap": 200,
    },
    "retrieval": {
        "embed_model_name": "BAAI/bge-small-en-v1.5",
        "embed_top_k": 20,
        "rerank_model_name": "ms-marco-MiniLM-L-12-v2",
        "rerank_top_k": 5,
        "rerank_candidates": 100,
        "score_threshold": 0.1,
        "recency_boost": 0.3,
        "recency_half_life_days": 365,
        "bm25_similarity_top_k": 10,
        "offline": False,
    },
    "confluence": {
        "url": "",
        "username": "",
        "api_token": "",
        "timeout": 10.0,
        "max_results": 30,
    },
    "ssl": {
        "ca_bundle_path": "",
    },
    "storage": {
        "storage_dir": "./storage",
        "model_cache_dir": "./models",
    },
}

# Merged configuration for the default path; None until load_config() fills it.
_config_cache: dict[str, Any] | None = None
83
+
84
+
85
def _deep_merge(base: dict[str, Any], override: dict[str, Any]) -> dict[str, Any]:
    """Deep merge override into base, returning a new, independent dict.

    Keys present in both mappings are merged recursively when both values are
    dicts; otherwise the override value replaces the base value. Keys keep the
    order of ``base``, followed by keys that only appear in ``override``.

    Args:
        base: Mapping supplying the fallback values. Never modified.
        override: Mapping whose values take precedence. Never modified.

    Returns:
        A new dict that shares no mutable containers with either input, so
        mutating the result cannot alter ``base`` or ``override``.
    """
    # Function-scope import keeps this block self-contained.
    import copy

    merged: dict[str, Any] = {}
    for key, base_value in base.items():
        if key not in override:
            # A shallow copy would leave nested dicts/lists shared with base.
            merged[key] = copy.deepcopy(base_value)
            continue
        override_value = override[key]
        if isinstance(base_value, dict) and isinstance(override_value, dict):
            merged[key] = _deep_merge(base_value, override_value)
        else:
            merged[key] = copy.deepcopy(override_value)

    # Keys that exist only in the override are appended in their own order.
    for key, override_value in override.items():
        if key not in base:
            merged[key] = copy.deepcopy(override_value)
    return merged
94
+
95
+
96
def load_config(config_path: Path | None = None) -> dict[str, Any]:
    """Load configuration from YAML file with defaults.

    Args:
        config_path: Optional path to config file. If None, uses default CONFIG_PATH.
            Results are cached only when using the default path. A plain
            string path is accepted as well.

    Returns:
        Configuration dictionary with defaults merged in. For the default
        path, later calls return the same cached dict object. When the file
        does not exist, an independent copy of the built-in defaults is
        returned and nothing is cached.

    Raises:
        ValueError: If the file's top-level YAML value is not a mapping.
    """
    global _config_cache
    # Function-scope import keeps this block self-contained.
    import copy

    # Return cached config if available (only for default path)
    if config_path is None and _config_cache is not None:
        return _config_cache

    # Path() also accepts str, so a caller passing a string does not crash
    # on .exists() below.
    path = Path(config_path or CONFIG_PATH)

    if not path.exists():
        logger.info("Config file not found at %s; using built-in defaults", path)
        # Deep copy: a shallow copy would hand out the nested default dicts
        # themselves, letting a caller's mutation corrupt _DEFAULTS.
        return copy.deepcopy(_DEFAULTS)

    logger.info("Using config: %s", path)

    with open(path, "r", encoding="utf-8") as f:
        # An empty file parses to None; treat it as "no overrides".
        user_config = yaml.safe_load(f) or {}

    if not isinstance(user_config, dict):
        # Without this check the merge fails with an opaque AttributeError.
        raise ValueError(
            f"Config file {path} must contain a YAML mapping at the top level, "
            f"got {type(user_config).__name__}"
        )

    result = _deep_merge(_DEFAULTS, user_config)

    # Cache result only for default path
    if config_path is None:
        _config_cache = result

    return result
130
+
131
+
132
def get(key: str, default: Any = None) -> Any:
    """Look up a configuration value using a dot-separated key path.

    The path is walked one segment at a time through the dict returned by
    ``load_config()``. The walk stops and ``default`` is returned as soon as
    the current node is not a dict or lacks the next segment.

    Args:
        key: Dot-separated key path (e.g., 'retrieval.embed_top_k')
        default: Value to return if key not found

    Returns:
        Configuration value or default.

    Example:
        >>> get('retrieval.embed_top_k')
        20
        >>> get('storage.storage_dir')
        './storage'
    """
    node: Any = load_config()
    for segment in key.split("."):
        if not isinstance(node, dict) or segment not in node:
            return default
        node = node[segment]
    return node
157
+
158
+
159
def reload_config() -> dict[str, Any]:
    """Drop the cached configuration and load it again via load_config().

    Returns:
        The dict produced by the fresh ``load_config()`` call.
    """
    global _config_cache
    # Clearing the cache makes the next load_config() call bypass it.
    _config_cache = None
    fresh = load_config()
    return fresh
chunksilo/cli.py ADDED
@@ -0,0 +1,124 @@
1
+ #!/usr/bin/env python3
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ """
4
+ CLI entry point for the chunksilo command.
5
+
6
+ Usage:
7
+ chunksilo "query text" [--date-from YYYY-MM-DD] [--date-to YYYY-MM-DD] [--config PATH] [--json]
8
+ chunksilo --build-index [--config PATH]
9
+ chunksilo --download-models [--config PATH]
10
+ """
11
+ import argparse
12
+ import json
13
+ import logging
14
+ import os
15
+ import sys
16
+ from pathlib import Path
17
+
18
+
19
def _build_parser() -> argparse.ArgumentParser:
    """Create the argument parser for the `chunksilo` command."""
    parser = argparse.ArgumentParser(
        prog="chunksilo",
        description="Search indexed documents using ChunkSilo",
        epilog=(
            "config file search order (first found wins):\n"
            "  1. --config PATH argument\n"
            "  2. CHUNKSILO_CONFIG environment variable\n"
            "  3. ./config.yaml\n"
            "  4. ~/.config/chunksilo/config.yaml\n"
            "  If none found, built-in defaults are used."
        ),
        # Keep the epilog's manual line breaks instead of re-wrapping them.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument("query", nargs="?", default=None, help="Search query text")
    parser.add_argument("--date-from", help="Start date filter (YYYY-MM-DD, inclusive)")
    parser.add_argument("--date-to", help="End date filter (YYYY-MM-DD, inclusive)")
    parser.add_argument("--config", help="Path to config.yaml (overrides auto-discovery)")
    parser.add_argument("--json", action="store_true", help="Output as JSON")
    parser.add_argument("-v", "--verbose", action="store_true",
                        help="Show diagnostic messages (model loading, search stats)")
    parser.add_argument("--build-index", action="store_true",
                        help="Build or update the search index, then exit")
    parser.add_argument("--download-models", action="store_true",
                        help="Download required ML models, then exit")
    return parser


def _print_results(result: dict) -> None:
    """Print a search result dict to stdout in the human-readable format."""
    # `or` fallbacks tolerate keys that are present but set to None.
    matched_files = result.get("matched_files") or []
    chunks = result.get("chunks") or []

    if matched_files:
        print(f"\nMatched files ({len(matched_files)}):")
        for f in matched_files:
            print(f"  {f.get('uri', 'unknown')} (score: {(f.get('score') or 0):.4f})")

    if not chunks:
        print("\nNo results found.")
        return

    print(f"\nResults ({len(chunks)}):\n")

    for i, chunk in enumerate(chunks, 1):
        loc = chunk.get("location") or {}
        uri = loc.get("uri") or "unknown"
        heading = " > ".join(loc.get("heading_path") or [])
        # A None score would make the float format below raise TypeError.
        score = chunk.get("score") or 0

        print(f"[{i}] {uri}")
        if heading:
            print(f"    Heading: {heading}")
        if loc.get("page"):
            print(f"    Page: {loc['page']}")
        if loc.get("line"):
            print(f"    Line: {loc['line']}")
        print(f"    Score: {score:.3f}")

        # Show at most 200 characters of the chunk, flattened to one line.
        text = chunk.get("text") or ""
        preview = text[:200].replace("\n", " ")
        if len(text) > 200:
            preview += "..."
        print(f"    {preview}")
        print()

    retrieval_time = result.get("retrieval_time", "")
    if retrieval_time:
        print(f"({retrieval_time})")


def main():
    """Entry point for the `chunksilo` command.

    Parses ``sys.argv`` and then either builds the index / downloads models,
    or runs a search and prints the result as JSON or as readable text.

    Exits with status 1 when the search result carries an ``error`` entry,
    in both JSON and text mode, and with status 2 (via ``parser.error``)
    when no query and no action flag is given.
    """
    # NOTE(review): presumably silences the Hugging Face tokenizers
    # fork/parallelism warning; confirm against the indexing code.
    os.environ["TOKENIZERS_PARALLELISM"] = "false"

    parser = _build_parser()
    args = parser.parse_args()

    # Index/model maintenance runs are always verbose; searches only with -v.
    log_level = logging.INFO if args.verbose or args.build_index or args.download_models else logging.WARNING
    logging.basicConfig(level=log_level, format="%(message)s", stream=sys.stderr)

    config_path = Path(args.config) if args.config else None

    if args.build_index or args.download_models:
        # Imported only on this path, so plain searches and --help never
        # load the indexing module.
        from .index import build_index

        build_index(
            download_only=args.download_models,
            config_path=config_path,
        )
        return

    if not args.query:
        parser.error("query is required (or use --build-index / --download-models)")

    from .search import run_search

    result = run_search(
        query=args.query,
        date_from=args.date_from,
        date_to=args.date_to,
        config_path=config_path,
    )
    error = result.get("error")

    if args.json:
        print(json.dumps(result, indent=2))
        # The JSON still goes to stdout, but the exit status must reflect the
        # failure so scripts can detect it without parsing the payload.
        if error:
            sys.exit(1)
        return

    if error:
        print(f"Error: {error}", file=sys.stderr)
        sys.exit(1)

    _print_results(result)
@@ -0,0 +1,96 @@
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ """
3
+ Custom HTML formatter for Confluence content.
4
+
5
+ TEMPORARY FIX: Strips syntax highlighting <span> tags that cause issues
6
+ with markdownify. Remove this file when the upstream issue is fixed.
7
+
8
+ To remove this workaround:
9
+ 1. Delete this file
10
+ 2. Remove the import and patch_confluence_reader() call from chunksilo.py
11
+ """
12
+
13
+ from bs4 import BeautifulSoup
14
+
15
+
16
def clean_confluence_html(html: str) -> str:
    """
    Unwrap syntax-highlighting <span> tags in Confluence HTML.

    Confluence wraps code in many <span> tags for syntax highlighting that can
    cause markdownify to insert unwanted newlines between characters. A span
    is treated as highlighting when its inline style mentions "color" or
    "background", or when one of its classes contains "code".

    Args:
        html: Raw HTML string from Confluence

    Returns:
        The HTML re-serialized with matching spans replaced by their
        contents. Falsy input is returned unchanged.
    """
    if not html:
        return html

    def _is_highlight_span(tag) -> bool:
        # Inline colour styling is the first signal.
        inline_style = tag.get("style", "")
        if "color" in inline_style or "background" in inline_style:
            return True
        # Otherwise fall back to code-related class names.
        class_names = tag.get("class", [])
        return any("code" in name for name in class_names if isinstance(name, str))

    document = BeautifulSoup(html, "html.parser")
    for tag in document.find_all("span"):
        if _is_highlight_span(tag):
            # unwrap() removes the tag itself but keeps its children in place.
            tag.unwrap()

    return str(document)
53
+
54
+
55
class CleanHtmlTextParser:
    """
    Drop-in replacement for llama_index's HtmlTextParser that cleans
    syntax highlighting spans before conversion.
    """

    def __init__(self):
        """Verify that markdownify is importable.

        Raises:
            ImportError: If the `markdownify` package is not installed. The
                original import failure is attached as the cause.
        """
        try:
            from markdownify import markdownify  # noqa: F401
        except ImportError as exc:
            # Chain explicitly so the traceback shows the underlying failure
            # as the direct cause rather than as an incidental context.
            raise ImportError(
                "`markdownify` package not found, please run `pip install markdownify`"
            ) from exc

    def convert(self, html: str) -> str:
        """Convert Confluence HTML to Markdown.

        Args:
            html: Raw HTML string; falsy input yields an empty string.

        Returns:
            The Markdown produced by markdownify from the cleaned HTML.
        """
        # Short-circuit before importing: empty input needs no converter.
        if not html:
            return ""

        from markdownify import markdownify

        # Clean the HTML first
        cleaned_html = clean_confluence_html(html)

        return markdownify(
            cleaned_html,
            heading_style="ATX",
            bullets="*",
            strip=["script", "style"],
        )
84
+
85
+
86
def patch_confluence_reader():
    """
    Monkey-patch the ConfluenceReader to use our clean HTML parser.
    Call this before creating ConfluenceReader instances.

    Does nothing (apart from a debug log message) when the llama_index
    Confluence reader package cannot be imported.
    """
    try:
        import llama_index.readers.confluence.html_parser as html_parser_module
    except ImportError:
        # Best effort by design: the reader is optional. Leave a trace so a
        # missing patch can be diagnosed instead of failing silently.
        import logging

        logging.getLogger(__name__).debug(
            "llama_index Confluence reader not installed; HTML parser patch skipped"
        )
        return

    # NOTE(review): this rebinds the attribute on the html_parser module only.
    # Code that already imported HtmlTextParser by name keeps the original
    # class; confirm the reader resolves it through this module.
    html_parser_module.HtmlTextParser = CleanHtmlTextParser