srxy 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
srxy/__init__.py ADDED
@@ -0,0 +1,15 @@
1
+ from srxy.core import magic_search, search
2
+ from srxy.dsl import Q
3
+ from srxy.file_search import magic_file_search
4
+ from srxy.models import FieldConfig, MatchType, SearchResult
5
+
6
+
7
+ __all__ = [
8
+ "FieldConfig",
9
+ "MatchType",
10
+ "magic_file_search",
11
+ "magic_search",
12
+ "Q",
13
+ "SearchResult",
14
+ "search",
15
+ ]
srxy/cli.py ADDED
@@ -0,0 +1,242 @@
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import json
5
+ import os
6
+ import sys
7
+
8
+ from srxy.file_search import magic_file_search, suggest_max_file_size
9
+ from srxy.models import FileSearchResult, SkippedFile
10
+ from srxy.utils import format_match_preview
11
+
12
+
13
# Display names for the known match-location kinds. Each kind currently maps
# to itself; kinds that are not listed are rendered verbatim by
# ``format_location_label``.
_LOCATION_LABELS = {
    kind: kind for kind in ("line", "page", "paragraph", "row", "slide")
}


def format_location_label(kind: str, number: int) -> str:
    """Return a human-readable location such as ``"page 3"``.

    Args:
        kind: Location kind; looked up in ``_LOCATION_LABELS`` and used
            unchanged when it is not a known key.
        number: Position of the match within the file.
    """
    try:
        display_name = _LOCATION_LABELS[kind]
    except KeyError:
        # Unknown kinds fall back to their own identifier.
        display_name = kind
    return f"{display_name} {number}"
25
+
26
+
27
+ def _match_labels(result: FileSearchResult) -> str:
28
+ labels: list[str] = []
29
+ if "name" in result.breakdown and result.breakdown["name"] > 0.0:
30
+ labels.append("name")
31
+ if result.lines or result.breakdown.get("content", 0.0) > 0.0:
32
+ labels.append("content")
33
+ return ", ".join(labels) if labels else "match"
34
+
35
+
36
def format_grouped(results: list[FileSearchResult], *, query: str = "") -> str:
    """Render results as one section per file under a summary header.

    The header states how many files matched (and the query, when given).
    Each file section shows the path, overall score and matched parts,
    followed by a location line and a preview line for every matched line.
    Sections are separated by a blank line. Returns ``""`` for no results.
    """
    if not results:
        return ""

    total = len(results)
    noun = "file" if total == 1 else "files"
    summary = f"{total} {noun} matched"
    if query:
        summary = f'{summary} for "{query}"'

    sections: list[str] = []
    for result in results:
        path_text = result.path.as_posix()
        matched_parts = _match_labels(result)
        section = [
            f"── {path_text} ──",
            f" score {result.score:.2f} · matched: {matched_parts}",
        ]
        for hit in result.lines:
            where = format_location_label(hit.location_kind, hit.line_number)
            snippet = format_match_preview(hit.text, query)
            section.append(f" {where} · score {hit.score:.2f}")
            section.append(f" │ {snippet}")
        sections.append("\n".join(section))

    # A blank line between sections, a single newline after the header.
    return "\n".join((summary, "\n\n".join(sections)))
61
+
62
+
63
def format_flat(results: list[FileSearchResult]) -> str:
    """Render results as colon-separated records, one per output line.

    A file with matched lines produces one
    ``path:kind:number:score:text`` record per line. A file without matched
    lines produces a single ``path:name:0:score:filename`` record, but only
    when its ``"name"`` breakdown score is positive; otherwise it is omitted.
    """
    records: list[str] = []
    for result in results:
        path_text = result.path.as_posix()
        if result.lines:
            records.extend(
                f"{path_text}:{hit.location_kind}:{hit.line_number}:"
                f"{hit.score:.2f}:{hit.text}"
                for hit in result.lines
            )
            continue
        if "name" in result.breakdown and result.breakdown["name"] > 0.0:
            records.append(f"{path_text}:name:0:{result.score:.2f}:{result.path.name}")
    return "\n".join(records)
76
+
77
+
78
def format_json(results: list[FileSearchResult], *, query: str = "") -> str:
    """Serialise results to an indented JSON array.

    Each element holds the file's ``path``, ``score``, ``breakdown`` and a
    ``lines`` list; every line entry carries its number, location kind,
    formatted location label, preview (built from ``query``), raw text and
    score.
    """
    documents = []
    for result in results:
        document = {
            "path": result.path.as_posix(),
            "score": result.score,
            "breakdown": result.breakdown,
        }
        line_entries = []
        for hit in result.lines:
            line_entries.append(
                {
                    "line_number": hit.line_number,
                    "location_kind": hit.location_kind,
                    "location_label": format_location_label(hit.location_kind, hit.line_number),
                    "preview": format_match_preview(hit.text, query),
                    "text": hit.text,
                    "score": hit.score,
                }
            )
        document["lines"] = line_entries
        documents.append(document)
    return json.dumps(documents, indent=2)
99
+
100
+
101
def format_skipped_file_warnings(skipped_files: list[SkippedFile], max_file_size: int) -> str:
    """Build the warning text for files whose content search was skipped.

    Each skipped file yields a warning line stating its size against
    ``max_file_size`` and a hint line with the value returned by
    ``suggest_max_file_size``. Returns ``""`` when nothing was skipped.
    """
    messages: list[str] = []
    for entry in skipped_files or ():
        hint_size = suggest_max_file_size(entry.size_bytes)
        messages += [
            f"warning: skipped content search in {entry.path.as_posix()} "
            f"({entry.size_bytes:,} bytes > --max-file-size {max_file_size:,})",
            f" hint: rerun with --max-file-size {hint_size}",
        ]
    return "\n".join(messages)
114
+
115
+
116
def build_parser() -> argparse.ArgumentParser:
    """Create the ``srxy`` command-line argument parser.

    Besides the positional ``query`` and optional ``path``, the parser
    defines scoring/size limits, output selection, and three mutually
    exclusive flag pairs controlling what is searched.
    """
    parser = argparse.ArgumentParser(
        prog="srxy",
        description="Fuzzy file and content search using composite matchers.",
    )
    add = parser.add_argument

    add("query", help="Search string")
    add("path", nargs="?", default=".", help="File or directory to search (default: .)")
    add("--threshold", type=float, default=0.25, help="Minimum match score (default: 0.25)")
    add(
        "--max-file-size",
        type=int,
        default=1_048_576,
        help="Skip content search in files larger than this many bytes (default: 1048576)",
    )
    add(
        "--max-line-matches",
        type=int,
        default=50,
        help="Maximum matching lines returned per file (default: 50)",
    )
    add(
        "--format",
        choices=("grouped", "flat"),
        default="grouped",
        help="Output format for human-readable results (default: grouped)",
    )
    add("--json", action="store_true", help="Emit machine-readable JSON")
    add("--semantic", action="store_true", help="Enable semantic matching (SRXY_SEMANTIC=1)")

    exclusive_mode = parser.add_mutually_exclusive_group()
    exclusive_mode.add_argument("--names-only", action="store_true", help="Search file names only")
    exclusive_mode.add_argument("--content-only", action="store_true", help="Search file contents only")

    # Each on/off pair shares one destination. The "on" flag is registered
    # first with default=None so that "neither flag given" stays
    # distinguishable from an explicit choice.
    toggles = (
        ("names", "search_names", "Search file names", "Skip file name search"),
        ("content", "search_contents", "Search contents", "Skip content search"),
    )
    for flag, dest, on_help, off_help in toggles:
        pair = parser.add_mutually_exclusive_group()
        pair.add_argument(f"--{flag}", action="store_true", dest=dest, default=None, help=on_help)
        pair.add_argument(f"--no-{flag}", action="store_false", dest=dest, help=off_help)

    add(
        "--include-hidden",
        action="store_true",
        help="Search hidden directories and files (default: skip dot-prefixed entries)",
    )
    add(
        "--include-noise",
        action="store_true",
        help="Search noise directories like __pycache__ and node_modules (default: skip)",
    )

    return parser
173
+
174
+
175
def resolve_search_modes(args: argparse.Namespace) -> tuple[bool, bool]:
    """Translate parsed flags into ``(search_names, search_contents)``.

    ``names_only`` is checked first, then ``content_only``; either one
    overrides the individual toggles. Otherwise each toggle that was left
    unset (``None``) defaults to ``True``.
    """

    def enabled_unless_set(flag):
        # None means the user passed neither the on nor the off switch.
        return True if flag is None else flag

    if args.names_only:
        return (True, False)
    if args.content_only:
        return (False, True)
    return (
        enabled_unless_set(args.search_names),
        enabled_unless_set(args.search_contents),
    )
184
+
185
+
186
def render_results(
    results: list[FileSearchResult],
    *,
    as_json: bool,
    output_format: str,
    query: str = "",
) -> str:
    """Format results with the renderer selected by the output options.

    ``as_json`` takes priority over ``output_format``; any format other
    than ``"flat"`` falls back to the grouped layout.
    """
    if as_json:
        return format_json(results, query=query)
    return (
        format_flat(results)
        if output_format == "flat"
        else format_grouped(results, query=query)
    )
198
+
199
+
200
def main(argv: list[str] | None = None) -> int:
    """Run the command-line search and return the process exit status.

    Returns:
        ``0`` when at least one result was found, ``1`` when there were
        none, and ``2`` when the search raised ``FileNotFoundError`` or
        ``ValueError`` (the error is printed to stderr).
    """
    args = build_parser().parse_args(argv)

    if args.semantic:
        os.environ["SRXY_SEMANTIC"] = "1"

    search_names, search_contents = resolve_search_modes(args)
    skipped: list[SkippedFile] = []

    try:
        results = magic_file_search(
            args.path,
            args.query,
            search_names=search_names,
            search_contents=search_contents,
            threshold=args.threshold,
            max_file_size=args.max_file_size,
            max_line_matches=args.max_line_matches,
            skip_hidden_folders=not args.include_hidden,
            skip_noise_folders=not args.include_noise,
            # Skipped files are only collected when contents are searched.
            skipped_files=skipped if search_contents else None,
        )
    except (FileNotFoundError, ValueError) as error:
        print(error, file=sys.stderr)
        return 2

    skip_report = format_skipped_file_warnings(skipped, args.max_file_size)
    if skip_report:
        # Warnings go to stderr so stdout carries only the results.
        print(skip_report, file=sys.stderr)

    rendered = render_results(
        results,
        as_json=args.json,
        output_format=args.format,
        query=args.query,
    )
    if rendered:
        print(rendered)

    return int(not results)
239
+
240
+
241
# Script entry point: exit with the status code returned by main().
if __name__ == "__main__":
    raise SystemExit(main())
srxy/core.py ADDED
@@ -0,0 +1,150 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Any
4
+
5
+ from srxy.matchers.registry import get_matcher
6
+ from srxy.models import FieldConfig, MatchType, Q, QNodeType, SearchResult
7
+ from srxy.utils import discover_fields, get_field_value, normalize_text
8
+
9
+
10
+ _DEFAULT_MAGIC_SEARCH_FIELDS: list[str] = []
11
+
12
+
13
def _score_field(
    item: Any,
    query: str,
    field: str,
    match_type: MatchType,
    composite_weights: dict[MatchType, float] | None,
) -> tuple[float, Any]:
    """Score one field of ``item`` against ``query``.

    Args:
        item: Object whose field is read via ``get_field_value``.
        query: Query text passed to the matcher.
        field: Name of the field to score.
        match_type: Matcher kind resolved through ``get_matcher``.
        composite_weights: Optional weights forwarded to ``get_matcher``.

    Returns:
        ``(score, detail)``. ``detail`` is the matcher's sub-breakdown when
        the matcher offers ``score_with_breakdown`` and that breakdown is
        non-empty; otherwise it is the score itself.
    """
    value = normalize_text(get_field_value(item, field))
    matcher = get_matcher(match_type, composite_weights)

    if hasattr(matcher, "score_with_breakdown"):
        score, sub_breakdown = matcher.score_with_breakdown(query, value)
        if sub_breakdown:
            return score, sub_breakdown
        return score, score

    # Score once and reuse the value: calling the matcher twice doubles the
    # work and could return a mismatched pair.
    score = matcher.score(query, value)
    return score, score
28
+
29
+
30
+ def _evaluate_leaf(item: Any, query: str, leaf: Q) -> tuple[float, dict[str, Any]]:
31
+ if leaf.field is None:
32
+ return 0.0, {}
33
+
34
+ score, detail = _score_field(item, query, leaf.field, leaf.match, leaf.composite_weights)
35
+ return score, {leaf.field: detail}
36
+
37
+
38
def _evaluate_q(item: Any, query: str, expr: Q) -> tuple[float, dict[str, Any]]:
    """Recursively evaluate a query expression against ``item``.

    Leaves are scored directly. For other nodes the children's breakdowns
    are merged (later children overwrite equal keys) and their scores are
    combined with ``min`` for AND nodes and ``max`` otherwise. A node
    without children scores ``0.0``.
    """
    if expr.node_type == QNodeType.LEAF:
        return _evaluate_leaf(item, query, expr)

    scores: list[float] = []
    merged: dict[str, Any] = {}
    for child in expr.children:
        child_score, child_breakdown = _evaluate_q(item, query, child)
        scores.append(child_score)
        merged.update(child_breakdown)

    if not scores:
        return 0.0, merged

    combine = min if expr.node_type == QNodeType.AND else max
    return combine(scores), merged
54
+
55
+
56
+ def _evaluate_fields(
57
+ item: Any,
58
+ query: str,
59
+ fields: list[FieldConfig],
60
+ require_all: bool,
61
+ threshold: float,
62
+ ) -> tuple[float, dict[str, Any], bool]:
63
+ field_scores: list[tuple[float, float]] = []
64
+ breakdown: dict[str, Any] = {}
65
+
66
+ for field_config in fields:
67
+ score, detail = _score_field(
68
+ item,
69
+ query,
70
+ field_config.name,
71
+ field_config.match,
72
+ field_config.composite_weights,
73
+ )
74
+ field_scores.append((score, field_config.weight))
75
+ breakdown[field_config.name] = detail
76
+
77
+ if not field_scores:
78
+ return 0.0, breakdown, False
79
+
80
+ if require_all and any(score <= 0.0 for score, _ in field_scores):
81
+ return 0.0, breakdown, False
82
+
83
+ if require_all and threshold > 0.0 and any(score < threshold for score, _ in field_scores):
84
+ return 0.0, breakdown, False
85
+
86
+ total_weight = sum(weight for _, weight in field_scores)
87
+ if total_weight == 0.0:
88
+ return 0.0, breakdown, False
89
+
90
+ weighted_score = sum(score * weight for score, weight in field_scores) / total_weight
91
+ return weighted_score, breakdown, True
92
+
93
+
94
def search(
    items: list[Any],
    query: str,
    *,
    fields: list[FieldConfig] | None = None,
    where: Q | None = None,
    threshold: float = 0.0,
    require_all: bool = False,
) -> list[SearchResult]:
    """Score ``items`` against ``query`` and return the matches, best first.

    Exactly one of ``fields`` (weighted field configs) or ``where`` (a query
    expression) must be given. Items whose score is non-positive or below
    ``threshold`` are dropped. A query that normalises to an empty value
    yields no results.

    Raises:
        ValueError: If both or neither of ``fields`` and ``where`` are given.
    """
    use_fields = fields is not None
    use_where = where is not None
    if use_fields and use_where:
        raise ValueError("Provide either 'fields' or 'where', not both")
    if not (use_fields or use_where):
        raise ValueError("Provide either 'fields' or 'where'")

    needle = normalize_text(query)
    if not needle:
        return []

    hits: list[SearchResult] = []
    for item in items:
        if use_where:
            score, breakdown = _evaluate_q(item, needle, where)
            include = True
        else:
            score, breakdown, include = _evaluate_fields(
                item,
                needle,
                fields,
                require_all,
                threshold,
            )
        if not include or score <= 0.0 or score < threshold:
            continue
        hits.append(SearchResult(item=item, score=score, breakdown=breakdown))

    # sorted() is stable, so equally scored items keep their input order.
    return sorted(hits, key=lambda hit: hit.score, reverse=True)
134
+
135
+
136
def magic_search(
    items: list[Any],
    query: str,
    *,
    fields: list[str] = _DEFAULT_MAGIC_SEARCH_FIELDS,
    threshold: float = 0.25,
) -> list[SearchResult]:
    """Search ``items`` with a composite matcher on each field, any-of.

    When ``fields`` is empty the field names come from
    ``discover_fields(items)``; if that is empty too, the result is ``[]``.
    """
    field_names = fields or discover_fields(items)
    if not field_names:
        return []

    # An item matches when any single field matches the query.
    any_field = Q.any(*map(Q.composite, field_names))
    return search(items, query, where=any_field, threshold=threshold)
srxy/document_text.py ADDED
@@ -0,0 +1,80 @@
1
+ from __future__ import annotations
2
+
3
+ from collections.abc import Callable, Iterator
4
+ from pathlib import Path
5
+
6
+
7
# Lower-case file extensions that are read through a document extractor.
DOCUMENT_SUFFIXES = frozenset((".pdf", ".docx", ".xlsx", ".pptx"))


def is_document_path(path: Path) -> bool:
    """Return whether ``path`` has a supported document extension.

    The comparison ignores the case of the extension.
    """
    extension = path.suffix.lower()
    return extension in DOCUMENT_SUFFIXES
12
+
13
+
14
def iter_document_lines(path: Path) -> Iterator[tuple[int, str]]:
    """Yield ``(number, text)`` pairs extracted from a document file.

    The extractor is chosen by the lower-cased file extension; unsupported
    extensions yield nothing. Extraction is best effort: any ``Exception``
    raised by the extractor ends the iteration silently, keeping whatever
    was already yielded.
    """
    extension = path.suffix.lower()
    dispatch: dict[str, Callable[[Path], Iterator[tuple[int, str]]]] = {
        ".pdf": _iter_pdf_lines,
        ".docx": _iter_docx_lines,
        ".xlsx": _iter_xlsx_lines,
        ".pptx": _iter_pptx_lines,
    }
    if extension not in dispatch:
        return

    extract = dispatch[extension]
    try:
        yield from extract(path)
    except Exception:
        # Deliberately swallowed so an unreadable document does not abort
        # the caller's iteration.
        return
29
+
30
+
31
def _iter_pdf_lines(path: Path) -> Iterator[tuple[int, str]]:
    """Yield ``(page_number, text)`` for each PDF page with non-blank text.

    Page numbers start at 1; the text is stripped of surrounding whitespace.
    """
    # Imported lazily so pypdf is only needed when a PDF is actually read.
    from pypdf import PdfReader

    pages = PdfReader(path).pages
    for page_number, page in enumerate(pages, start=1):
        content = (page.extract_text() or "").strip()
        if not content:
            continue
        yield page_number, content
40
+
41
+
42
def _iter_docx_lines(path: Path) -> Iterator[tuple[int, str]]:
    """Yield ``(paragraph_number, text)`` for each non-blank DOCX paragraph.

    Numbering starts at 1 and counts blank paragraphs too, so yielded
    numbers reflect positions in the document's paragraph list.
    """
    # Imported lazily so python-docx is only needed for DOCX files.
    from docx import Document

    paragraphs = Document(str(path)).paragraphs
    for paragraph_number, paragraph in enumerate(paragraphs, start=1):
        content = paragraph.text.strip()
        if not content:
            continue
        yield paragraph_number, content
50
+
51
+
52
def _iter_xlsx_lines(path: Path) -> Iterator[tuple[int, str]]:
    """Yield ``(row_number, text)`` for each non-empty XLSX row.

    Numbering runs continuously across all worksheets and counts only rows
    with at least one non-blank cell. Each text is prefixed with the sheet
    title in brackets, followed by the space-joined cell values. The
    workbook is closed when iteration ends.
    """
    # Imported lazily so openpyxl is only needed for XLSX files.
    from openpyxl import load_workbook

    workbook = load_workbook(path, read_only=True, data_only=True)
    try:
        emitted = 0
        for sheet in workbook.worksheets:
            for row in sheet.iter_rows(values_only=True):
                rendered = (str(cell) for cell in row if cell is not None)
                values = [text for text in rendered if text.strip()]
                if values:
                    emitted += 1
                    yield emitted, f"[{sheet.title}] " + " ".join(values)
    finally:
        workbook.close()
67
+
68
+
69
def _iter_pptx_lines(path: Path) -> Iterator[tuple[int, str]]:
    """Yield ``(slide_number, text)`` for each PPTX slide containing text.

    Slide numbers start at 1. The text joins the stripped, non-empty
    ``text`` of every shape that has such an attribute, separated by spaces.
    """
    # Imported lazily so python-pptx is only needed for PPTX files.
    from pptx import Presentation

    slides = Presentation(str(path)).slides
    for slide_number, slide in enumerate(slides, start=1):
        stripped = (
            shape.text.strip() for shape in slide.shapes if hasattr(shape, "text")
        )
        fragments = [fragment for fragment in stripped if fragment]
        if fragments:
            yield slide_number, " ".join(fragments)
srxy/dsl.py ADDED
@@ -0,0 +1,6 @@
1
+ """Query DSL — re-exports the Q expression class."""
2
+
3
+ from srxy.models import Q
4
+
5
+
6
+ __all__ = ["Q"]