chunksilo 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of chunksilo might be problematic. Click here for more details.
- chunksilo/__init__.py +4 -0
- chunksilo/__main__.py +3 -0
- chunksilo/cfgload.py +163 -0
- chunksilo/cli.py +124 -0
- chunksilo/confluence_html_formatter.py +96 -0
- chunksilo/index.py +1420 -0
- chunksilo/search.py +784 -0
- chunksilo/server.py +110 -0
- chunksilo-2.0.0.dist-info/METADATA +366 -0
- chunksilo-2.0.0.dist-info/RECORD +15 -0
- chunksilo-2.0.0.dist-info/WHEEL +5 -0
- chunksilo-2.0.0.dist-info/entry_points.txt +3 -0
- chunksilo-2.0.0.dist-info/licenses/LICENSE +191 -0
- chunksilo-2.0.0.dist-info/licenses/NOTICE +33 -0
- chunksilo-2.0.0.dist-info/top_level.txt +1 -0
chunksilo/__init__.py
ADDED
chunksilo/__main__.py
ADDED
chunksilo/cfgload.py
ADDED
|
@@ -0,0 +1,163 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
"""
|
|
4
|
+
Shared configuration loading for ChunkSilo.
|
|
5
|
+
|
|
6
|
+
Loads configuration from config.yaml, searching in standard locations.
|
|
7
|
+
"""
|
|
8
|
+
import copy
import logging
import os
from pathlib import Path
from typing import Any

import yaml
|
|
13
|
+
|
|
14
|
+
logger = logging.getLogger(__name__)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _find_config() -> Path:
    """Locate config.yaml by checking candidate locations in priority order.

    Candidates, highest priority first:
      1. the CHUNKSILO_CONFIG environment variable (returned even if the
         file does not exist; load_config() falls back to defaults then)
      2. config.yaml in the current working directory
      3. ~/.config/chunksilo/config.yaml (XDG-style user config)

    If none of the files exist, the cwd candidate is still returned so
    callers always get a deterministic default path.
    """
    override = os.environ.get("CHUNKSILO_CONFIG")
    if override:
        return Path(override)

    local_candidate = Path.cwd() / "config.yaml"
    user_candidate = Path.home() / ".config" / "chunksilo" / "config.yaml"

    if not local_candidate.exists() and user_candidate.exists():
        return user_candidate
    # Either the cwd file exists, or nothing was found and the cwd path
    # serves as the default (load_config() handles the missing-file case).
    return local_candidate
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
# Resolved once at import time; reload_config() re-reads the file at this
# path but does not repeat the search.
CONFIG_PATH = _find_config()

# Built-in defaults. User values from config.yaml are deep-merged on top of
# this structure by load_config().
_DEFAULTS: dict[str, Any] = {
    # What to index and how to chunk it.
    "indexing": {
        "directories": ["./data"],
        "defaults": {
            "include": ["**/*.pdf", "**/*.md", "**/*.txt", "**/*.docx", "**/*.doc"],
            "exclude": [],
            "recursive": True,
        },
        "chunk_size": 1600,
        "chunk_overlap": 200,
    },
    # Embedding / reranking parameters for search.
    "retrieval": {
        "embed_model_name": "BAAI/bge-small-en-v1.5",
        "embed_top_k": 20,
        "rerank_model_name": "ms-marco-MiniLM-L-12-v2",
        "rerank_top_k": 5,
        "rerank_candidates": 100,
        "score_threshold": 0.1,
        "recency_boost": 0.3,
        "recency_half_life_days": 365,
        "bm25_similarity_top_k": 10,
        "offline": False,
    },
    # Confluence connection settings (empty strings disable the integration
    # by default — presumably checked by the caller; confirm in search/index).
    "confluence": {
        "url": "",
        "username": "",
        "api_token": "",
        "timeout": 10.0,
        "max_results": 30,
    },
    "ssl": {
        "ca_bundle_path": "",
    },
    # On-disk locations for the index and downloaded models.
    "storage": {
        "storage_dir": "./storage",
        "model_cache_dir": "./models",
    },
}

# Memoized result of load_config() for the default CONFIG_PATH; cleared by
# reload_config(). Never populated for explicit config_path arguments.
_config_cache: dict[str, Any] | None = None
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def _deep_merge(base: dict[str, Any], override: dict[str, Any]) -> dict[str, Any]:
    """Recursively merge *override* into *base*, returning a new dict.

    Keys whose values are dicts on both sides are merged key-by-key; any
    other value in *override* (including lists) replaces the *base* value
    wholesale. Neither input dict is mutated.
    """
    merged = dict(base)
    for key, incoming in override.items():
        current = merged.get(key)
        if isinstance(current, dict) and isinstance(incoming, dict):
            merged[key] = _deep_merge(current, incoming)
        else:
            merged[key] = incoming
    return merged
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def load_config(config_path: Path | None = None) -> dict[str, Any]:
    """Load configuration from YAML file with defaults.

    Args:
        config_path: Optional path to config file. If None, uses default
            CONFIG_PATH. Results are cached only when using the default path.

    Returns:
        Configuration dictionary with defaults merged in. The result never
        shares mutable state with _DEFAULTS, so callers may modify it freely.
    """
    global _config_cache

    # Return cached config if available (only for default path)
    if _config_cache is not None and config_path is None:
        return _config_cache

    path = config_path or CONFIG_PATH

    if not path.exists():
        logger.info("Config file not found at %s; using built-in defaults", path)
        # Deep copy: a shallow .copy() would hand callers the nested default
        # dicts/lists themselves, so mutating the result would corrupt
        # _DEFAULTS for every subsequent load.
        return copy.deepcopy(_DEFAULTS)

    logger.info("Using config: %s", path)

    with open(path, "r", encoding="utf-8") as f:
        # safe_load: never construct arbitrary Python objects from config.
        user_config = yaml.safe_load(f) or {}

    # Merge onto a deep copy so leaf values that the user did not override
    # (e.g. default lists) are not shared between the result and _DEFAULTS.
    result = _deep_merge(copy.deepcopy(_DEFAULTS), user_config)

    # Cache result only for default path
    if config_path is None:
        _config_cache = result

    return result
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
def get(key: str, default: Any = None) -> Any:
    """Look up a configuration value via a dot-separated key path.

    Args:
        key: Dot-separated key path (e.g., 'retrieval.embed_top_k')
        default: Value to return if key not found

    Returns:
        Configuration value or default.

    Example:
        >>> get('retrieval.embed_top_k')
        20
        >>> get('storage.storage_dir')
        './storage'
    """
    node: Any = load_config()
    for part in key.split("."):
        # Stop as soon as the path leaves the nested-dict structure.
        if not isinstance(node, dict) or part not in node:
            return default
        node = node[part]
    return node
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
def reload_config() -> dict[str, Any]:
    """Drop the cached configuration and re-read it from disk.

    Returns:
        The freshly loaded configuration dictionary.
    """
    global _config_cache
    _config_cache = None  # invalidate so load_config() re-reads the file
    return load_config()
|
chunksilo/cli.py
ADDED
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
"""
|
|
4
|
+
CLI entry point for the chunksilo command.
|
|
5
|
+
|
|
6
|
+
Usage:
|
|
7
|
+
chunksilo "query text" [--date-from YYYY-MM-DD] [--date-to YYYY-MM-DD] [--config PATH] [--json]
|
|
8
|
+
chunksilo --build-index [--config PATH]
|
|
9
|
+
chunksilo --download-models [--config PATH]
|
|
10
|
+
"""
|
|
11
|
+
import argparse
|
|
12
|
+
import json
|
|
13
|
+
import logging
|
|
14
|
+
import os
|
|
15
|
+
import sys
|
|
16
|
+
from pathlib import Path
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def main():
    """Entry point for the `chunksilo` command."""
    # Keep HuggingFace tokenizers from spawning parallel workers under the CLI.
    os.environ["TOKENIZERS_PARALLELISM"] = "false"

    parser = _build_arg_parser()
    args = parser.parse_args()

    # Diagnostics go to stderr; verbose for explicit -v or long-running modes.
    wants_info = args.verbose or args.build_index or args.download_models
    logging.basicConfig(
        level=logging.INFO if wants_info else logging.WARNING,
        format="%(message)s",
        stream=sys.stderr,
    )

    config_path = Path(args.config) if args.config else None

    # Maintenance modes: build the index / fetch models, then exit.
    if args.build_index or args.download_models:
        from .index import build_index

        build_index(
            download_only=args.download_models,
            config_path=config_path,
        )
        return

    if not args.query:
        parser.error("query is required (or use --build-index / --download-models)")

    from .search import run_search

    result = run_search(
        query=args.query,
        date_from=args.date_from,
        date_to=args.date_to,
        config_path=config_path,
    )

    if args.json:
        print(json.dumps(result, indent=2))
        return

    # Check for errors
    if result.get("error"):
        print(f"Error: {result['error']}", file=sys.stderr)
        sys.exit(1)

    _print_human_output(result)


def _build_arg_parser() -> argparse.ArgumentParser:
    """Construct the argument parser for the chunksilo CLI."""
    parser = argparse.ArgumentParser(
        prog="chunksilo",
        description="Search indexed documents using ChunkSilo",
        epilog=(
            "config file search order (first found wins):\n"
            " 1. --config PATH argument\n"
            " 2. CHUNKSILO_CONFIG environment variable\n"
            " 3. ./config.yaml\n"
            " 4. ~/.config/chunksilo/config.yaml\n"
            " If none found, built-in defaults are used."
        ),
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument("query", nargs="?", default=None, help="Search query text")
    parser.add_argument("--date-from", help="Start date filter (YYYY-MM-DD, inclusive)")
    parser.add_argument("--date-to", help="End date filter (YYYY-MM-DD, inclusive)")
    parser.add_argument("--config", help="Path to config.yaml (overrides auto-discovery)")
    parser.add_argument("--json", action="store_true", help="Output as JSON")
    parser.add_argument("-v", "--verbose", action="store_true",
                        help="Show diagnostic messages (model loading, search stats)")
    parser.add_argument("--build-index", action="store_true",
                        help="Build or update the search index, then exit")
    parser.add_argument("--download-models", action="store_true",
                        help="Download required ML models, then exit")
    return parser


def _print_human_output(result) -> None:
    """Render a search-result dict as human-readable text on stdout."""
    matched_files = result.get("matched_files", [])
    chunks = result.get("chunks", [])

    if matched_files:
        print(f"\nMatched files ({len(matched_files)}):")
        for f in matched_files:
            print(f" {f.get('uri', 'unknown')} (score: {f.get('score', 0):.4f})")

    if not chunks:
        print("\nNo results found.")
        return

    print(f"\nResults ({len(chunks)}):\n")

    for i, chunk in enumerate(chunks, 1):
        loc = chunk.get("location", {})
        uri = loc.get("uri") or "unknown"
        heading = " > ".join(loc.get("heading_path") or [])

        print(f"[{i}] {uri}")
        if heading:
            print(f" Heading: {heading}")
        if loc.get("page"):
            print(f" Page: {loc['page']}")
        if loc.get("line"):
            print(f" Line: {loc['line']}")
        print(f" Score: {chunk.get('score', 0):.3f}")

        # 200-character single-line preview of the chunk text.
        text = chunk.get("text", "")
        preview = text[:200].replace("\n", " ")
        if len(text) > 200:
            preview += "..."
        print(f" {preview}")
        print()

    retrieval_time = result.get("retrieval_time", "")
    if retrieval_time:
        print(f"({retrieval_time})")
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
2
|
+
"""
|
|
3
|
+
Custom HTML formatter for Confluence content.
|
|
4
|
+
|
|
5
|
+
TEMPORARY FIX: Strips syntax highlighting <span> tags that cause issues
|
|
6
|
+
with markdownify. Remove this file when the upstream issue is fixed.
|
|
7
|
+
|
|
8
|
+
To remove this workaround:
|
|
9
|
+
1. Delete this file
|
|
10
|
+
2. Remove the import and patch_confluence_reader() call from chunksilo.py
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from bs4 import BeautifulSoup
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def clean_confluence_html(html: str) -> str:
    """
    Strip Confluence syntax-highlighting <span> wrappers from HTML.

    Confluence wraps code in many nested <span> elements (colour/background
    inline styles or "code"-related classes). markdownify can insert
    unwanted newlines between such spans, so they are unwrapped — replaced
    by their own contents — before conversion.

    Args:
        html: Raw HTML string from Confluence

    Returns:
        Cleaned HTML with syntax highlighting spans unwrapped; falsy input
        is returned unchanged.
    """
    if not html:
        return html

    soup = BeautifulSoup(html, "html.parser")

    for span in soup.find_all("span"):
        style_attr = span.get("style", "")
        class_attr = span.get("class", [])

        # Heuristic: colour/background styling or a code-ish class marks a
        # highlighting span worth removing.
        looks_like_highlighting = (
            "color" in style_attr
            or "background" in style_attr
            or any("code" in cls for cls in class_attr if isinstance(cls, str))
        )
        if looks_like_highlighting:
            span.unwrap()  # keep the children, drop the wrapper tag

    return str(soup)
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
class CleanHtmlTextParser:
    """
    Drop-in replacement for llama_index's HtmlTextParser.

    Same interface as the original parser, but the input HTML is passed
    through clean_confluence_html() before markdownify conversion so that
    syntax-highlighting spans cannot mangle the markdown output.
    """

    def __init__(self):
        # Fail fast at construction time if the converter is unavailable.
        try:
            from markdownify import markdownify  # noqa: F401
        except ImportError:
            raise ImportError(
                "`markdownify` package not found, please run `pip install markdownify`"
            )

    def convert(self, html: str) -> str:
        """Convert *html* to markdown, cleaning highlighting spans first."""
        from markdownify import markdownify

        if not html:
            return ""

        return markdownify(
            clean_confluence_html(html),
            heading_style="ATX",
            bullets="*",
            strip=["script", "style"],
        )
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def patch_confluence_reader():
    """
    Monkey-patch the ConfluenceReader to use our clean HTML parser.
    Call this before creating ConfluenceReader instances.

    Replaces the HtmlTextParser attribute of
    llama_index.readers.confluence.html_parser with CleanHtmlTextParser.
    NOTE(review): this only takes effect if the reader resolves the parser
    through that module attribute (not via a `from ... import` binding
    captured earlier) — confirm against the installed reader version.
    Safe to call when the reader package is absent: the ImportError is
    deliberately swallowed and the call becomes a no-op.
    """
    try:
        import llama_index.readers.confluence.html_parser as html_parser_module

        # Rebind the module-level class so later lookups get our parser.
        html_parser_module.HtmlTextParser = CleanHtmlTextParser
    except ImportError:
        pass  # ConfluenceReader not installed
|