chunksilo 2.1.1__tar.gz → 2.1.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of chunksilo might be problematic. Click here for more details.
- {chunksilo-2.1.1/src/chunksilo.egg-info → chunksilo-2.1.3}/PKG-INFO +1 -1
- {chunksilo-2.1.1 → chunksilo-2.1.3}/pyproject.toml +1 -1
- {chunksilo-2.1.1 → chunksilo-2.1.3}/src/chunksilo/__init__.py +1 -1
- {chunksilo-2.1.1 → chunksilo-2.1.3}/src/chunksilo/search.py +92 -35
- {chunksilo-2.1.1 → chunksilo-2.1.3/src/chunksilo.egg-info}/PKG-INFO +1 -1
- {chunksilo-2.1.1 → chunksilo-2.1.3}/test/test_jira_integration.py +44 -0
- {chunksilo-2.1.1 → chunksilo-2.1.3}/LICENSE +0 -0
- {chunksilo-2.1.1 → chunksilo-2.1.3}/NOTICE +0 -0
- {chunksilo-2.1.1 → chunksilo-2.1.3}/README.md +0 -0
- {chunksilo-2.1.1 → chunksilo-2.1.3}/requirements.txt +0 -0
- {chunksilo-2.1.1 → chunksilo-2.1.3}/setup.cfg +0 -0
- {chunksilo-2.1.1 → chunksilo-2.1.3}/src/chunksilo/__main__.py +0 -0
- {chunksilo-2.1.1 → chunksilo-2.1.3}/src/chunksilo/cfgload.py +0 -0
- {chunksilo-2.1.1 → chunksilo-2.1.3}/src/chunksilo/cli.py +0 -0
- {chunksilo-2.1.1 → chunksilo-2.1.3}/src/chunksilo/confluence_html_formatter.py +0 -0
- {chunksilo-2.1.1 → chunksilo-2.1.3}/src/chunksilo/index.py +0 -0
- {chunksilo-2.1.1 → chunksilo-2.1.3}/src/chunksilo/server.py +0 -0
- {chunksilo-2.1.1 → chunksilo-2.1.3}/src/chunksilo.egg-info/SOURCES.txt +0 -0
- {chunksilo-2.1.1 → chunksilo-2.1.3}/src/chunksilo.egg-info/dependency_links.txt +0 -0
- {chunksilo-2.1.1 → chunksilo-2.1.3}/src/chunksilo.egg-info/entry_points.txt +0 -0
- {chunksilo-2.1.1 → chunksilo-2.1.3}/src/chunksilo.egg-info/requires.txt +0 -0
- {chunksilo-2.1.1 → chunksilo-2.1.3}/src/chunksilo.egg-info/top_level.txt +0 -0
- {chunksilo-2.1.1 → chunksilo-2.1.3}/test/test_chunk_location.py +0 -0
- {chunksilo-2.1.1 → chunksilo-2.1.3}/test/test_confluence_html_formatter.py +0 -0
- {chunksilo-2.1.1 → chunksilo-2.1.3}/test/test_error_handling.py +0 -0
- {chunksilo-2.1.1 → chunksilo-2.1.3}/test/test_heading_path_integration.py +0 -0
- {chunksilo-2.1.1 → chunksilo-2.1.3}/test/test_incremental_ingest.py +0 -0
- {chunksilo-2.1.1 → chunksilo-2.1.3}/test/test_rag_metrics.py +0 -0
- {chunksilo-2.1.1 → chunksilo-2.1.3}/test/test_retrieval_only.py +0 -0
- {chunksilo-2.1.1 → chunksilo-2.1.3}/test/test_system.py +0 -0
- {chunksilo-2.1.1 → chunksilo-2.1.3}/test/test_utils.py +0 -0
|
@@ -389,8 +389,8 @@ def _prepare_jira_jql_query(query: str, config: dict[str, Any]) -> str:
|
|
|
389
389
|
"""Construct a JQL query from user search terms and configuration.
|
|
390
390
|
|
|
391
391
|
Uses Jira's 'text' field which searches across Summary, Description,
|
|
392
|
-
Environment, Comments, and all text custom fields.
|
|
393
|
-
|
|
392
|
+
Environment, Comments, and all text custom fields. Additionally detects
|
|
393
|
+
Jira issue keys (e.g., "ABEI-1660") and includes exact key searches.
|
|
394
394
|
|
|
395
395
|
Note: Fuzzy search operators (~) are deprecated in Jira Cloud but work
|
|
396
396
|
in Data Center/Server. ChunkSilo's semantic search (embeddings + reranker)
|
|
@@ -414,20 +414,35 @@ def _prepare_jira_jql_query(query: str, config: dict[str, Any]) -> str:
|
|
|
414
414
|
References:
|
|
415
415
|
- Jira text field: https://support.atlassian.com/jira-software-cloud/docs/search-for-work-items-using-the-text-field/
|
|
416
416
|
- JQL operators: https://support.atlassian.com/jira-software-cloud/docs/jql-operators/
|
|
417
|
+
- JQL key field: https://support.atlassian.com/jira-software-cloud/docs/search-by-issue-key/
|
|
417
418
|
"""
|
|
419
|
+
# Detect Jira issue keys in the query (e.g., "ABEI-1660", "PROJ-123")
|
|
420
|
+
# Pattern matches: a letter followed by 1+ letters/digits, a hyphen, then 1+ digits
|
|
421
|
+
# Matching is case-insensitive; matched keys are normalized to uppercase before use
|
|
422
|
+
issue_key_pattern = r'\b([A-Z][A-Z0-9]+-\d+)\b'
|
|
423
|
+
detected_keys = re.findall(issue_key_pattern, query, re.IGNORECASE)
|
|
424
|
+
|
|
425
|
+
# Build key search clauses for exact issue key matches
|
|
426
|
+
key_clauses = []
|
|
427
|
+
if detected_keys:
|
|
428
|
+
# Normalize to uppercase (Jira keys are case-insensitive)
|
|
429
|
+
unique_keys = list(dict.fromkeys(k.upper() for k in detected_keys))
|
|
430
|
+
key_clauses = [f'key = "{key}"' for key in unique_keys]
|
|
431
|
+
logger.debug(f"Detected Jira issue keys in query: {unique_keys}")
|
|
432
|
+
|
|
418
433
|
# Reuse Confluence query term preparation for stopword filtering
|
|
419
434
|
# This gives us a clean list of meaningful search terms
|
|
420
435
|
query_terms = _prepare_confluence_query_terms(query)
|
|
421
436
|
|
|
422
437
|
# Build the text search clause
|
|
423
438
|
# Using JQL 'text' field which searches across all text fields for broad recall
|
|
439
|
+
text_clause = ""
|
|
424
440
|
if not query_terms:
|
|
425
441
|
# No meaningful terms after filtering, use original query
|
|
426
442
|
escaped = query.strip().replace('"', '\\"')
|
|
427
|
-
if not
|
|
428
|
-
|
|
429
|
-
|
|
430
|
-
text_clause = f'text ~ "{escaped}"'
|
|
443
|
+
if escaped and not detected_keys:
|
|
444
|
+
# Only add text clause if we don't have issue keys
|
|
445
|
+
text_clause = f'text ~ "{escaped}"'
|
|
431
446
|
elif len(query_terms) == 1:
|
|
432
447
|
# Single term - simple text search
|
|
433
448
|
text_clause = f'text ~ "{query_terms[0]}"'
|
|
@@ -437,6 +452,21 @@ def _prepare_jira_jql_query(query: str, config: dict[str, Any]) -> str:
|
|
|
437
452
|
text_conditions = ' OR '.join([f'text ~ "{term}"' for term in query_terms])
|
|
438
453
|
text_clause = f'({text_conditions})'
|
|
439
454
|
|
|
455
|
+
# Combine key and text searches
|
|
456
|
+
if key_clauses and text_clause:
|
|
457
|
+
# Search both by key and text content
|
|
458
|
+
combined_clause = f'({" OR ".join(key_clauses)} OR {text_clause})'
|
|
459
|
+
elif key_clauses:
|
|
460
|
+
# Only key searches
|
|
461
|
+
combined_clause = " OR ".join(key_clauses)
|
|
462
|
+
elif text_clause:
|
|
463
|
+
# Only text search
|
|
464
|
+
combined_clause = text_clause
|
|
465
|
+
else:
|
|
466
|
+
# No valid search terms
|
|
467
|
+
logger.warning("Jira search skipped: empty query after processing")
|
|
468
|
+
return ""
|
|
469
|
+
|
|
440
470
|
# Add project filter if configured
|
|
441
471
|
# Empty projects list means search all accessible projects
|
|
442
472
|
projects = config["jira"].get("projects", [])
|
|
@@ -444,9 +474,9 @@ def _prepare_jira_jql_query(query: str, config: dict[str, Any]) -> str:
|
|
|
444
474
|
# Restrict search to specific project keys
|
|
445
475
|
project_list = ", ".join([f'"{p}"' for p in projects])
|
|
446
476
|
project_clause = f'project IN ({project_list})'
|
|
447
|
-
jql = f'{
|
|
477
|
+
jql = f'{combined_clause} AND {project_clause}'
|
|
448
478
|
else:
|
|
449
|
-
jql =
|
|
479
|
+
jql = combined_clause
|
|
450
480
|
|
|
451
481
|
# Order by updated DESC for recency
|
|
452
482
|
# This enables ChunkSilo's recency boost feature and returns most relevant recent issues first
|
|
@@ -738,8 +768,8 @@ def _search_jira(query: str, config: dict[str, Any]) -> list[NodeWithScore]:
|
|
|
738
768
|
|
|
739
769
|
Configuration Requirements:
|
|
740
770
|
config["jira"]["url"]: Jira base URL (empty = disabled)
|
|
741
|
-
config["jira"]["username"]: Jira username
|
|
742
|
-
config["jira"]["api_token"]:
|
|
771
|
+
config["jira"]["username"]: Jira username/email (required for Cloud, optional for Server PAT)
|
|
772
|
+
config["jira"]["api_token"]: API token (Cloud) or Personal Access Token (Server/Data Center)
|
|
743
773
|
config["jira"]["max_results"]: Maximum issues to return
|
|
744
774
|
config["jira"]["projects"]: List of project keys (empty = all)
|
|
745
775
|
config["jira"]["include_comments"]: Include issue comments
|
|
@@ -763,8 +793,12 @@ def _search_jira(query: str, config: dict[str, Any]) -> list[NodeWithScore]:
|
|
|
763
793
|
- Automatically configured through jira_options["verify"]
|
|
764
794
|
|
|
765
795
|
Authentication:
|
|
766
|
-
-
|
|
767
|
-
|
|
796
|
+
- Jira Cloud: Set both username (email) and api_token (API token)
|
|
797
|
+
Uses basic auth internally
|
|
798
|
+
- Jira Server/Data Center with PAT: Set only api_token (Personal Access Token)
|
|
799
|
+
Leave username empty; uses bearer token auth internally
|
|
800
|
+
- Jira Server/Data Center with password: Set username and api_token (password)
|
|
801
|
+
Uses basic auth internally (if basic auth enabled on server)
|
|
768
802
|
|
|
769
803
|
References:
|
|
770
804
|
- Jira REST API: https://developer.atlassian.com/cloud/jira/platform/rest/v3/
|
|
@@ -790,13 +824,10 @@ def _search_jira(query: str, config: dict[str, Any]) -> list[NodeWithScore]:
|
|
|
790
824
|
ca_bundle_path = config["ssl"]["ca_bundle_path"] or None
|
|
791
825
|
|
|
792
826
|
# Validate required credentials are present
|
|
793
|
-
|
|
794
|
-
|
|
795
|
-
|
|
796
|
-
|
|
797
|
-
if not api_token:
|
|
798
|
-
missing.append("jira.api_token")
|
|
799
|
-
logger.warning(f"Jira search skipped: missing {', '.join(missing)} in config")
|
|
827
|
+
# For Jira Cloud: both username and api_token required (basic auth)
|
|
828
|
+
# For Jira Server/Data Center with PAT: only api_token required (token auth)
|
|
829
|
+
if not api_token:
|
|
830
|
+
logger.warning("Jira search skipped: missing jira.api_token in config")
|
|
800
831
|
return []
|
|
801
832
|
|
|
802
833
|
try:
|
|
@@ -806,12 +837,23 @@ def _search_jira(query: str, config: dict[str, Any]) -> list[NodeWithScore]:
|
|
|
806
837
|
if ca_bundle_path:
|
|
807
838
|
jira_options["verify"] = ca_bundle_path
|
|
808
839
|
|
|
809
|
-
#
|
|
810
|
-
#
|
|
811
|
-
|
|
812
|
-
|
|
813
|
-
|
|
814
|
-
|
|
840
|
+
# Choose authentication method based on credentials provided:
|
|
841
|
+
# - Username + API token: Use basic auth (Jira Cloud, or Server with password)
|
|
842
|
+
# - API token only: Use token auth (Jira Server/Data Center with PAT)
|
|
843
|
+
if username:
|
|
844
|
+
# Basic auth for Jira Cloud (username + API token)
|
|
845
|
+
# Also works for Jira Server with username + password
|
|
846
|
+
jira_client = JIRA(
|
|
847
|
+
options=jira_options,
|
|
848
|
+
basic_auth=(username, api_token)
|
|
849
|
+
)
|
|
850
|
+
else:
|
|
851
|
+
# Token auth for Jira Server/Data Center Personal Access Tokens (PAT)
|
|
852
|
+
# PATs are used alone without a username
|
|
853
|
+
jira_client = JIRA(
|
|
854
|
+
options=jira_options,
|
|
855
|
+
token_auth=api_token
|
|
856
|
+
)
|
|
815
857
|
|
|
816
858
|
# Construct JQL with text search and project filtering
|
|
817
859
|
jql = _prepare_jira_jql_query(query, config)
|
|
@@ -1116,6 +1158,7 @@ def run_search(
|
|
|
1116
1158
|
rerank_request = RerankRequest(query=enhanced_query, passages=passages)
|
|
1117
1159
|
reranked_results = reranker.rerank(rerank_request)
|
|
1118
1160
|
|
|
1161
|
+
# Build text-to-node mapping for fallback text matching
|
|
1119
1162
|
text_to_indices: dict[str, list[tuple[int, NodeWithScore]]] = {}
|
|
1120
1163
|
for idx, node in enumerate(nodes):
|
|
1121
1164
|
node_text = node.node.get_content() or ""
|
|
@@ -1126,20 +1169,34 @@ def run_search(
|
|
|
1126
1169
|
reranked_nodes = []
|
|
1127
1170
|
seen_indices: set[int] = set()
|
|
1128
1171
|
for result in reranked_results:
|
|
1129
|
-
doc_text = result.get("text", "")
|
|
1130
1172
|
score = result.get("score", 0.0)
|
|
1131
|
-
|
|
1132
|
-
|
|
1133
|
-
|
|
1134
|
-
|
|
1135
|
-
|
|
1136
|
-
|
|
1137
|
-
|
|
1138
|
-
|
|
1139
|
-
|
|
1173
|
+
result_idx = result.get("id")
|
|
1174
|
+
|
|
1175
|
+
# Primary: match by index (flashrank returns original passage index)
|
|
1176
|
+
if result_idx is not None and 0 <= result_idx < len(nodes):
|
|
1177
|
+
if result_idx not in seen_indices:
|
|
1178
|
+
node = nodes[result_idx]
|
|
1179
|
+
reranked_nodes.append(node)
|
|
1180
|
+
rerank_scores[id(node)] = float(score)
|
|
1181
|
+
seen_indices.add(result_idx)
|
|
1182
|
+
else:
|
|
1183
|
+
# Fallback: match by text content
|
|
1184
|
+
doc_text = result.get("text", "")
|
|
1185
|
+
if doc_text in text_to_indices:
|
|
1186
|
+
for idx, node in text_to_indices[doc_text]:
|
|
1187
|
+
if idx not in seen_indices:
|
|
1188
|
+
reranked_nodes.append(node)
|
|
1189
|
+
rerank_scores[id(node)] = float(score)
|
|
1190
|
+
seen_indices.add(idx)
|
|
1191
|
+
break
|
|
1192
|
+
|
|
1193
|
+
# Add remaining unmatched nodes with minimum matched score
|
|
1194
|
+
# This ensures Jira/Confluence results aren't dropped due to text mismatch
|
|
1195
|
+
min_score = min(rerank_scores.values()) if rerank_scores else 0.0
|
|
1140
1196
|
for idx, node in enumerate(nodes):
|
|
1141
1197
|
if idx not in seen_indices:
|
|
1142
1198
|
reranked_nodes.append(node)
|
|
1199
|
+
rerank_scores[id(node)] = min_score
|
|
1143
1200
|
|
|
1144
1201
|
nodes = reranked_nodes[:rerank_limit]
|
|
1145
1202
|
except Exception as e:
|
|
@@ -186,6 +186,50 @@ class TestJiraJqlQuery:
|
|
|
186
186
|
# Query with only stopwords should produce simple or empty query
|
|
187
187
|
assert jql == "" or "ORDER BY updated DESC" in jql
|
|
188
188
|
|
|
189
|
+
def test_issue_key_detection_single(self, base_config):
|
|
190
|
+
"""Single issue key should be detected and searched by key field."""
|
|
191
|
+
jql = _prepare_jira_jql_query("ABEI-1660", base_config)
|
|
192
|
+
assert 'key = "ABEI-1660"' in jql
|
|
193
|
+
assert "ORDER BY updated DESC" in jql
|
|
194
|
+
|
|
195
|
+
def test_issue_key_detection_lowercase(self, base_config):
|
|
196
|
+
"""Lowercase issue key should be normalized to uppercase."""
|
|
197
|
+
jql = _prepare_jira_jql_query("abei-1660", base_config)
|
|
198
|
+
assert 'key = "ABEI-1660"' in jql
|
|
199
|
+
assert "ORDER BY updated DESC" in jql
|
|
200
|
+
|
|
201
|
+
def test_issue_key_detection_multiple(self, base_config):
|
|
202
|
+
"""Multiple issue keys should be detected."""
|
|
203
|
+
jql = _prepare_jira_jql_query("ABEI-1660 PROJ-123", base_config)
|
|
204
|
+
assert 'key = "ABEI-1660"' in jql
|
|
205
|
+
assert 'key = "PROJ-123"' in jql
|
|
206
|
+
assert " OR " in jql
|
|
207
|
+
assert "ORDER BY updated DESC" in jql
|
|
208
|
+
|
|
209
|
+
def test_issue_key_mixed_with_text(self, base_config):
|
|
210
|
+
"""Issue key mixed with text should search both key and text."""
|
|
211
|
+
jql = _prepare_jira_jql_query("ABEI-1660 authentication", base_config)
|
|
212
|
+
assert 'key = "ABEI-1660"' in jql
|
|
213
|
+
assert 'text ~ "authentication"' in jql
|
|
214
|
+
assert " OR " in jql
|
|
215
|
+
assert "ORDER BY updated DESC" in jql
|
|
216
|
+
|
|
217
|
+
def test_no_issue_key_detection(self, base_config):
|
|
218
|
+
"""Non-issue-key queries should work as before."""
|
|
219
|
+
jql = _prepare_jira_jql_query("authentication bug", base_config)
|
|
220
|
+
assert "key =" not in jql # No key search
|
|
221
|
+
assert "text ~" in jql # Text search only
|
|
222
|
+
assert "ORDER BY updated DESC" in jql
|
|
223
|
+
|
|
224
|
+
def test_issue_key_with_project_filter(self, base_config):
|
|
225
|
+
"""Issue key search should respect project filter."""
|
|
226
|
+
base_config["jira"]["projects"] = ["ABEI"]
|
|
227
|
+
jql = _prepare_jira_jql_query("ABEI-1660", base_config)
|
|
228
|
+
assert 'key = "ABEI-1660"' in jql
|
|
229
|
+
assert "project IN" in jql
|
|
230
|
+
assert "ABEI" in jql
|
|
231
|
+
assert "ORDER BY updated DESC" in jql
|
|
232
|
+
|
|
189
233
|
|
|
190
234
|
# ============================================================================
|
|
191
235
|
# ISSUE TO TEXT CONVERSION TESTS
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|