srxy 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- srxy/__init__.py +15 -0
- srxy/cli.py +242 -0
- srxy/core.py +150 -0
- srxy/document_text.py +80 -0
- srxy/dsl.py +6 -0
- srxy/file_search.py +257 -0
- srxy/matchers/base.py +13 -0
- srxy/matchers/composite.py +36 -0
- srxy/matchers/contains.py +10 -0
- srxy/matchers/exact.py +10 -0
- srxy/matchers/fuzzy.py +14 -0
- srxy/matchers/partial.py +12 -0
- srxy/matchers/phonetic.py +54 -0
- srxy/matchers/registry.py +64 -0
- srxy/matchers/semantic.py +87 -0
- srxy/models.py +169 -0
- srxy/utils.py +124 -0
- srxy-1.0.0.dist-info/METADATA +235 -0
- srxy-1.0.0.dist-info/RECORD +23 -0
- srxy-1.0.0.dist-info/WHEEL +5 -0
- srxy-1.0.0.dist-info/entry_points.txt +2 -0
- srxy-1.0.0.dist-info/licenses/LICENSE +21 -0
- srxy-1.0.0.dist-info/top_level.txt +1 -0
srxy/__init__.py
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
from srxy.core import magic_search, search
|
|
2
|
+
from srxy.dsl import Q
|
|
3
|
+
from srxy.file_search import magic_file_search
|
|
4
|
+
from srxy.models import FieldConfig, MatchType, SearchResult
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
# Public API of the package: the names exported by ``from srxy import *``.
# Every entry is imported at the top of this module.
__all__ = [
    "FieldConfig",
    "MatchType",
    "magic_file_search",
    "magic_search",
    "Q",
    "SearchResult",
    "search",
]
|
srxy/cli.py
ADDED
|
@@ -0,0 +1,242 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import argparse
|
|
4
|
+
import json
|
|
5
|
+
import os
|
|
6
|
+
import sys
|
|
7
|
+
|
|
8
|
+
from srxy.file_search import magic_file_search, suggest_max_file_size
|
|
9
|
+
from srxy.models import FileSearchResult, SkippedFile
|
|
10
|
+
from srxy.utils import format_match_preview
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
# Display word for each known location kind. Kinds that are not listed here
# are shown verbatim by format_location_label.
_LOCATION_LABELS = dict(
    line="line",
    page="page",
    paragraph="paragraph",
    row="row",
    slide="slide",
)


def format_location_label(kind: str, number: int) -> str:
    """Return a human-readable location such as ``"page 3"``.

    Args:
        kind: Location kind (for example ``"line"`` or ``"slide"``).
        number: Position of the match within the file.

    Returns:
        The display word for ``kind`` (or ``kind`` itself when it is not a
        known kind), a single space, and ``number``.
    """
    try:
        display = _LOCATION_LABELS[kind]
    except KeyError:
        # Unknown kinds fall back to their own name.
        display = kind
    return f"{display} {number}"
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def _match_labels(result: FileSearchResult) -> str:
    """Summarise which parts of a file matched.

    Args:
        result: A file result exposing ``breakdown`` (mapping of part name to
            score) and ``lines`` (matched lines).

    Returns:
        ``"name"``, ``"content"`` or ``"name, content"``; ``"match"`` when
        neither part can be identified.
    """
    scores = result.breakdown
    name_hit = scores.get("name", 0.0) > 0.0
    # Matched lines count as a content hit even without a "content" score.
    content_hit = bool(result.lines) or scores.get("content", 0.0) > 0.0

    matched = [
        label
        for label, hit in (("name", name_hit), ("content", content_hit))
        if hit
    ]
    return ", ".join(matched) or "match"
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def format_grouped(results: list[FileSearchResult], *, query: str = "") -> str:
    """Render results as one section per file, preceded by a summary line.

    Args:
        results: File results to render.
        query: Original search string; when non-empty it is quoted in the
            summary line and passed to ``format_match_preview``.

    Returns:
        The rendered text, or ``""`` when ``results`` is empty. Sections are
        separated by a single blank line.
    """
    if not results:
        return ""

    total = len(results)
    noun = "file" if total == 1 else "files"
    summary = f"{total} {noun} matched"
    if query:
        summary = f'{summary} for "{query}"'

    sections: list[str] = []
    for hit in results:
        section = [
            f"── {hit.path.as_posix()} ──",
            f" score {hit.score:.2f} · matched: {_match_labels(hit)}",
        ]
        for entry in hit.lines:
            where = format_location_label(entry.location_kind, entry.line_number)
            snippet = format_match_preview(entry.text, query)
            section.append(f" {where} · score {entry.score:.2f}")
            section.append(f" │ {snippet}")
        sections.append("\n".join(section))

    # Summary first, then the sections with one empty line between them.
    return "\n".join([summary, "\n\n".join(sections)])
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def format_flat(results: list[FileSearchResult]) -> str:
    """Render results as colon-separated records, one per output line.

    A file with matched lines produces one
    ``path:kind:number:score:text`` record per line. A file without matched
    lines but with a positive ``"name"`` score produces a single
    ``path:name:0:score:filename`` record. Any other file produces nothing.

    Args:
        results: File results to render.

    Returns:
        The records joined with newlines (``""`` when there are none).
    """
    records: list[str] = []
    for hit in results:
        prefix = hit.path.as_posix()
        if hit.lines:
            records.extend(
                f"{prefix}:{entry.location_kind}:{entry.line_number}:"
                f"{entry.score:.2f}:{entry.text}"
                for entry in hit.lines
            )
            continue
        if hit.breakdown.get("name", 0.0) > 0.0:
            records.append(f"{prefix}:name:0:{hit.score:.2f}:{hit.path.name}")
    return "\n".join(records)
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def format_json(results: list[FileSearchResult], *, query: str = "") -> str:
    """Serialise results as a JSON array indented by two spaces.

    Each file becomes an object with ``path``, ``score``, ``breakdown`` and
    ``lines``. Each matched line carries its number, kind, display label,
    preview, raw text and score.

    Args:
        results: File results to serialise.
        query: Original search string, passed to ``format_match_preview``.

    Returns:
        The JSON document as a string.
    """

    def line_entry(entry) -> dict:
        # Key order is part of the emitted document; keep it stable.
        return {
            "line_number": entry.line_number,
            "location_kind": entry.location_kind,
            "location_label": format_location_label(entry.location_kind, entry.line_number),
            "preview": format_match_preview(entry.text, query),
            "text": entry.text,
            "score": entry.score,
        }

    documents = []
    for hit in results:
        documents.append(
            {
                "path": hit.path.as_posix(),
                "score": hit.score,
                "breakdown": hit.breakdown,
                "lines": [line_entry(entry) for entry in hit.lines],
            }
        )
    return json.dumps(documents, indent=2)
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def format_skipped_file_warnings(skipped_files: list[SkippedFile], max_file_size: int) -> str:
    """Build the warning text for files whose content search was skipped.

    Each skipped file yields two lines: a warning showing its size against
    the ``--max-file-size`` limit, and a hint with the value returned by
    ``suggest_max_file_size`` for that file.

    Args:
        skipped_files: Files that were skipped.
        max_file_size: The limit that was in effect, in bytes.

    Returns:
        The warning lines joined with newlines, or ``""`` when nothing was
        skipped.
    """
    if not skipped_files:
        return ""

    def describe(item) -> str:
        hint_size = suggest_max_file_size(item.size_bytes)
        warning = (
            f"warning: skipped content search in {item.path.as_posix()} "
            f"({item.size_bytes:,} bytes > --max-file-size {max_file_size:,})"
        )
        return f"{warning}\n hint: rerun with --max-file-size {hint_size}"

    return "\n".join(describe(item) for item in skipped_files)
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def build_parser() -> argparse.ArgumentParser:
    """Build the argument parser for the ``srxy`` command.

    The parser takes a required ``query`` and an optional ``path``, followed
    by scoring, size-limit and output options, three mutually exclusive flag
    groups that select what is searched, and two flags that widen traversal.

    Returns:
        The configured parser.
    """
    cli = argparse.ArgumentParser(
        description="Fuzzy file and content search using composite matchers.",
        prog="srxy",
    )
    add = cli.add_argument

    # Positionals.
    add("query", help="Search string")
    add("path", help="File or directory to search (default: .)", nargs="?", default=".")

    # Scoring, limits and output.
    add("--threshold", help="Minimum match score (default: 0.25)", type=float, default=0.25)
    add(
        "--max-file-size",
        help="Skip content search in files larger than this many bytes (default: 1048576)",
        default=1_048_576,
        type=int,
    )
    add(
        "--max-line-matches",
        help="Maximum matching lines returned per file (default: 50)",
        default=50,
        type=int,
    )
    add(
        "--format",
        help="Output format for human-readable results (default: grouped)",
        default="grouped",
        choices=("grouped", "flat"),
    )
    add("--json", help="Emit machine-readable JSON", action="store_true")
    add("--semantic", help="Enable semantic matching (SRXY_SEMANTIC=1)", action="store_true")

    # Shortcuts that pick exactly one search mode.
    only_group = cli.add_mutually_exclusive_group()
    only_group.add_argument("--names-only", help="Search file names only", action="store_true")
    only_group.add_argument("--content-only", help="Search file contents only", action="store_true")

    # On/off pairs sharing one destination. The "on" flag is registered first
    # with default=None so that the attribute stays None when neither flag is
    # given (argparse applies the first default registered for a dest).
    toggles = (
        ("names", "search_names", "Search file names", "Skip file name search"),
        ("content", "search_contents", "Search contents", "Skip content search"),
    )
    for flag, dest, on_help, off_help in toggles:
        pair = cli.add_mutually_exclusive_group()
        pair.add_argument(f"--{flag}", action="store_true", dest=dest, default=None, help=on_help)
        pair.add_argument(f"--no-{flag}", action="store_false", dest=dest, help=off_help)

    # Traversal widening.
    add(
        "--include-hidden",
        help="Search hidden directories and files (default: skip dot-prefixed entries)",
        action="store_true",
    )
    add(
        "--include-noise",
        help="Search noise directories like __pycache__ and node_modules (default: skip)",
        action="store_true",
    )

    return cli
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
def resolve_search_modes(args: argparse.Namespace) -> tuple[bool, bool]:
    """Work out whether names and/or contents should be searched.

    ``names_only`` takes precedence, then ``content_only``. Otherwise each
    mode follows its own flag and defaults to enabled when the flag is
    ``None``.

    Args:
        args: Parsed arguments carrying ``names_only``, ``content_only``,
            ``search_names`` and ``search_contents``.

    Returns:
        ``(search_names, search_contents)``.
    """
    shortcuts = (
        ("names_only", (True, False)),
        ("content_only", (False, True)),
    )
    for attribute, modes in shortcuts:
        if getattr(args, attribute):
            return modes

    names = args.search_names
    if names is None:
        names = True
    contents = args.search_contents
    if contents is None:
        contents = True
    return names, contents
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
def render_results(
    results: list[FileSearchResult],
    *,
    as_json: bool,
    output_format: str,
    query: str = "",
) -> str:
    """Render results with the formatter selected by the output options.

    Args:
        results: File results to render.
        as_json: When true, emit JSON regardless of ``output_format``.
        output_format: ``"flat"`` selects the flat formatter; any other value
            selects the grouped formatter.
        query: Original search string, forwarded to the JSON and grouped
            formatters.

    Returns:
        The rendered text.
    """
    if as_json:
        rendered = format_json(results, query=query)
    elif output_format == "flat":
        rendered = format_flat(results)
    else:
        rendered = format_grouped(results, query=query)
    return rendered
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
def main(argv: list[str] | None = None) -> int:
    """Run the command-line search and return the process exit status.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read
            ``sys.argv``.

    Returns:
        ``0`` when at least one result was found, ``1`` when there were none,
        and ``2`` when the search raised ``FileNotFoundError`` or
        ``ValueError`` (the error is printed to stderr).
    """
    args = build_parser().parse_args(argv)

    if args.semantic:
        # The flag is passed on through the process environment; the code
        # that reads SRXY_SEMANTIC is not visible in this module.
        os.environ["SRXY_SEMANTIC"] = "1"

    names_enabled, contents_enabled = resolve_search_modes(args)
    skipped: list[SkippedFile] = []

    try:
        results = magic_file_search(
            args.path,
            args.query,
            search_names=names_enabled,
            search_contents=contents_enabled,
            threshold=args.threshold,
            max_file_size=args.max_file_size,
            max_line_matches=args.max_line_matches,
            skip_hidden_folders=not args.include_hidden,
            skip_noise_folders=not args.include_noise,
            # Skipped files are only collected when contents are searched.
            skipped_files=skipped if contents_enabled else None,
        )
    except (FileNotFoundError, ValueError) as error:
        print(error, file=sys.stderr)
        return 2

    # Warnings go to stderr so they never mix with the results on stdout.
    warning_text = format_skipped_file_warnings(skipped, args.max_file_size)
    if warning_text:
        print(warning_text, file=sys.stderr)

    rendered = render_results(
        results,
        as_json=args.json,
        output_format=args.format,
        query=args.query,
    )
    if rendered:
        print(rendered)

    return 0 if results else 1
|
|
239
|
+
|
|
240
|
+
|
|
241
|
+
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|
srxy/core.py
ADDED
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Any
|
|
4
|
+
|
|
5
|
+
from srxy.matchers.registry import get_matcher
|
|
6
|
+
from srxy.models import FieldConfig, MatchType, Q, QNodeType, SearchResult
|
|
7
|
+
from srxy.utils import discover_fields, get_field_value, normalize_text
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
# Default value of magic_search's ``fields`` parameter. It is empty, which
# magic_search treats as "discover the fields from the items". The same list
# object is shared by every call, so it must never be mutated.
_DEFAULT_MAGIC_SEARCH_FIELDS: list[str] = []
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def _score_field(
    item: Any,
    query: str,
    field: str,
    match_type: MatchType,
    composite_weights: dict[MatchType, float] | None,
) -> tuple[float, Any]:
    """Score one field of ``item`` against ``query``.

    The field value is read with ``get_field_value`` and normalised with
    ``normalize_text`` before being handed to the matcher returned by
    ``get_matcher``.

    Args:
        item: Object being searched.
        query: Search string (callers in this module pass it already
            normalised).
        field: Name of the field to read from ``item``.
        match_type: Matcher kind to use.
        composite_weights: Optional per-matcher weights forwarded to
            ``get_matcher``.

    Returns:
        ``(score, detail)``. ``detail`` is the matcher's breakdown when the
        matcher provides ``score_with_breakdown`` and the breakdown is
        non-empty; otherwise it is the score itself.
    """
    value = normalize_text(get_field_value(item, field))
    matcher = get_matcher(match_type, composite_weights)

    if hasattr(matcher, "score_with_breakdown"):
        score, sub_breakdown = matcher.score_with_breakdown(query, value)
        if sub_breakdown:
            return score, sub_breakdown
        return score, score

    # Score once and reuse the result: calling the matcher twice doubles the
    # cost and could yield two different numbers for the same pair.
    score = matcher.score(query, value)
    return score, score
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def _evaluate_leaf(item: Any, query: str, leaf: Q) -> tuple[float, dict[str, Any]]:
    """Score a single leaf expression against ``item``.

    Args:
        item: Object being searched.
        query: Search string.
        leaf: Leaf node carrying ``field``, ``match`` and
            ``composite_weights``.

    Returns:
        ``(score, {field: detail})``, or ``(0.0, {})`` when the leaf has no
        field.
    """
    field_name = leaf.field
    if field_name is not None:
        score, detail = _score_field(
            item,
            query,
            field_name,
            leaf.match,
            leaf.composite_weights,
        )
        return score, {field_name: detail}
    return 0.0, {}
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def _evaluate_q(item: Any, query: str, expr: Q) -> tuple[float, dict[str, Any]]:
    """Recursively score a query expression against ``item``.

    Leaves are scored directly. For any other node the children are scored
    and combined: an AND node takes the lowest child score, every other node
    type takes the highest. Child breakdowns are merged into one mapping,
    with later children overwriting earlier ones on the same key.

    Args:
        item: Object being searched.
        query: Search string.
        expr: Expression node to evaluate.

    Returns:
        ``(score, breakdown)``; the score is ``0.0`` for a node without
        children.
    """
    if expr.node_type == QNodeType.LEAF:
        return _evaluate_leaf(item, query, expr)

    scores: list[float] = []
    merged: dict[str, Any] = {}
    for child in expr.children:
        child_score, child_breakdown = _evaluate_q(item, query, child)
        scores.append(child_score)
        merged.update(child_breakdown)

    if not scores:
        return 0.0, merged

    combine = min if expr.node_type == QNodeType.AND else max
    return combine(scores), merged
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def _evaluate_fields(
    item: Any,
    query: str,
    fields: list[FieldConfig],
    require_all: bool,
    threshold: float,
) -> tuple[float, dict[str, Any], bool]:
    """Score ``item`` over several weighted fields.

    Args:
        item: Object being searched.
        query: Search string.
        fields: Field configurations providing ``name``, ``match``,
            ``composite_weights`` and ``weight``.
        require_all: When true, every field must score above zero and, if
            ``threshold`` is positive, at least ``threshold``.
        threshold: Per-field minimum applied only with ``require_all``.

    Returns:
        ``(score, breakdown, include)``. ``score`` is the weight-averaged
        field score. ``include`` is false, with a score of ``0.0``, when
        there are no fields, a ``require_all`` check fails, or the weights
        sum to zero.
    """
    scores: list[float] = []
    weights: list[float] = []
    breakdown: dict[str, Any] = {}

    for config in fields:
        field_score, detail = _score_field(
            item,
            query,
            config.name,
            config.match,
            config.composite_weights,
        )
        scores.append(field_score)
        weights.append(config.weight)
        breakdown[config.name] = detail

    rejected = (0.0, breakdown, False)

    if not scores:
        return rejected

    if require_all:
        if any(value <= 0.0 for value in scores):
            return rejected
        if threshold > 0.0 and any(value < threshold for value in scores):
            return rejected

    total_weight = sum(weights)
    if total_weight == 0.0:
        # Nothing to average over; also avoids dividing by zero.
        return rejected

    weighted_sum = sum(value * weight for value, weight in zip(scores, weights))
    return weighted_sum / total_weight, breakdown, True
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def search(
    items: list[Any],
    query: str,
    *,
    fields: list[FieldConfig] | None = None,
    where: Q | None = None,
    threshold: float = 0.0,
    require_all: bool = False,
) -> list[SearchResult]:
    """Score ``items`` against ``query`` and return the matches, best first.

    Exactly one of ``fields`` and ``where`` must be given.

    Args:
        items: Objects to search.
        query: Search string; it is normalised with ``normalize_text``.
        fields: Weighted field configurations to score.
        where: Query expression to evaluate instead of ``fields``.
        threshold: Minimum overall score an item needs to be returned.
        require_all: With ``fields``, require every field to match.

    Returns:
        Results sorted by descending score. Items scoring zero or below, or
        under ``threshold``, are left out. An empty list is returned when the
        normalised query is empty.

    Raises:
        ValueError: If both or neither of ``fields`` and ``where`` are given.
    """
    if fields is None and where is None:
        raise ValueError("Provide either 'fields' or 'where'")
    if fields is not None and where is not None:
        raise ValueError("Provide either 'fields' or 'where', not both")

    needle = normalize_text(query)
    if not needle:
        return []

    matches: list[SearchResult] = []
    for item in items:
        if where is not None:
            score, breakdown = _evaluate_q(item, needle, where)
            keep = True
        else:
            # Validation above guarantees ``fields`` is set on this path.
            score, breakdown, keep = _evaluate_fields(
                item,
                needle,
                fields,
                require_all,
                threshold,
            )
        if not keep or score <= 0.0 or score < threshold:
            continue
        matches.append(SearchResult(item=item, score=score, breakdown=breakdown))

    return sorted(matches, key=lambda match: match.score, reverse=True)
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def magic_search(
    items: list[Any],
    query: str,
    *,
    fields: list[str] = _DEFAULT_MAGIC_SEARCH_FIELDS,
    threshold: float = 0.25,
) -> list[SearchResult]:
    """Search ``items`` with a composite matcher on each field.

    Args:
        items: Objects to search.
        query: Search string.
        fields: Field names to search. When empty, the names come from
            ``discover_fields(items)``.
        threshold: Minimum score an item needs to be returned.

    Returns:
        The results of ``search`` over an "any of these fields" expression,
        or an empty list when no field names are available.
    """
    field_names = fields or discover_fields(items)
    if not field_names:
        return []

    leaves = [Q.composite(name) for name in field_names]
    return search(items, query, where=Q.any(*leaves), threshold=threshold)
|
srxy/document_text.py
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from collections.abc import Callable, Iterator
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
# File extensions (lower-case, with the leading dot) treated as documents.
DOCUMENT_SUFFIXES = frozenset((".pdf", ".docx", ".xlsx", ".pptx"))


def is_document_path(path: Path) -> bool:
    """Return whether ``path`` ends with a document extension.

    The comparison is case-insensitive and looks at the final suffix only.
    """
    extension = path.suffix.lower()
    return extension in DOCUMENT_SUFFIXES
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def iter_document_lines(path: Path) -> Iterator[tuple[int, str]]:
    """Yield ``(number, text)`` pairs extracted from a document file.

    The extractor is chosen from the lower-cased file suffix. An unsupported
    suffix yields nothing.

    Extraction is best-effort. Any ``Exception`` raised while extracting,
    including a failure to import the parser library inside an extractor,
    ends the iteration silently. Pairs already yielded are kept.

    Args:
        path: Document to read.

    Yields:
        ``(number, text)`` pairs as produced by the matching extractor.
    """
    extractor: Callable[[Path], Iterator[tuple[int, str]]] | None = {
        ".pdf": _iter_pdf_lines,
        ".docx": _iter_docx_lines,
        ".xlsx": _iter_xlsx_lines,
        ".pptx": _iter_pptx_lines,
    }.get(path.suffix.lower())

    if extractor is not None:
        try:
            yield from extractor(path)
        except Exception:
            # Deliberately swallowed: an unreadable document just stops
            # producing text.
            return
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def _iter_pdf_lines(path: Path) -> Iterator[tuple[int, str]]:
    """Yield ``(page_number, text)`` for each PDF page that has text.

    Page numbers start at 1. Text is stripped of surrounding whitespace, and
    pages whose text is empty after stripping are skipped.
    """
    # Imported lazily, so pypdf is only needed when a PDF is actually read.
    from pypdf import PdfReader

    for number, page in enumerate(PdfReader(path).pages, start=1):
        content = (page.extract_text() or "").strip()
        if content:
            yield number, content
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def _iter_docx_lines(path: Path) -> Iterator[tuple[int, str]]:
    """Yield ``(paragraph_number, text)`` for each non-empty DOCX paragraph.

    Paragraph numbers start at 1 and count every entry of
    ``document.paragraphs``, including the empty ones that are skipped.
    """
    # Imported lazily, so python-docx is only needed when a DOCX is read.
    from docx import Document

    paragraphs = Document(str(path)).paragraphs
    for number, paragraph in enumerate(paragraphs, start=1):
        content = paragraph.text.strip()
        if content:
            yield number, content
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def _iter_xlsx_lines(path: Path) -> Iterator[tuple[int, str]]:
    """Yield ``(number, text)`` for each non-empty row of an XLSX workbook.

    Each row's text is the sheet title in square brackets followed by the
    row's non-blank cell values joined with spaces. The number is a running
    count of yielded rows across all sheets, starting at 1. It is not the
    spreadsheet's own row index.

    The workbook is closed when iteration finishes or is abandoned.
    """
    # Imported lazily, so openpyxl is only needed when an XLSX is read.
    from openpyxl import load_workbook

    workbook = load_workbook(path, read_only=True, data_only=True)
    try:
        emitted = 0
        for sheet in workbook.worksheets:
            for row in sheet.iter_rows(values_only=True):
                rendered = (str(cell) for cell in row if cell is not None)
                cells = [value for value in rendered if value.strip()]
                if cells:
                    emitted += 1
                    yield emitted, f"[{sheet.title}] " + " ".join(cells)
    finally:
        workbook.close()
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def _iter_pptx_lines(path: Path) -> Iterator[tuple[int, str]]:
    """Yield ``(slide_number, text)`` for each PPTX slide that has text.

    Slide numbers start at 1. A slide's text is the stripped text of every
    shape that has a ``text`` attribute, joined with single spaces. Slides
    with no such text are skipped.
    """
    # Imported lazily, so python-pptx is only needed when a PPTX is read.
    from pptx import Presentation

    slides = Presentation(str(path)).slides
    for number, slide in enumerate(slides, start=1):
        stripped = (shape.text.strip() for shape in slide.shapes if hasattr(shape, "text"))
        combined = " ".join(piece for piece in stripped if piece)
        if combined:
            yield number, combined
|