hmdev-cli 1.0.4 → 1.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/python/cli.py +101 -14
- package/scripts/runner.js +2 -2
package/package.json
CHANGED
package/python/cli.py
CHANGED
|
@@ -25,6 +25,8 @@ from html.parser import HTMLParser
|
|
|
25
25
|
from typing import Any
|
|
26
26
|
|
|
27
27
|
import httpx
|
|
28
|
+
from rapidfuzz import fuzz
|
|
29
|
+
import jieba
|
|
28
30
|
|
|
29
31
|
from builder import HvigorTool, HDCTool
|
|
30
32
|
from config import Config
|
|
@@ -295,6 +297,67 @@ async def build_index() -> dict[str, Any]:
|
|
|
295
297
|
return result
|
|
296
298
|
|
|
297
299
|
|
|
300
|
+
# ── Search Ranking ─────────────────────────────────────────────────────────────
|
|
301
|
+
|
|
302
|
+
def compute_relevance_score(query: str, title: str, object_id: str, catalog_name: str) -> float:
    """Score how relevant one document is to a search query.

    All comparisons are case-insensitive. The score is the sum of these
    independent signals:

    - Title match: +5.0 if equal to the query, +4.0 if it starts with the
      query, +3.0 if it contains the query (only the strongest applies).
    - +1.5 if the query is a substring of the object ID.
    - +0.5 if the query is a substring of the catalog name.
    - rapidfuzz similarity of query vs. title: ``partial_ratio`` scaled to
      2.0, ``token_sort_ratio`` scaled to 1.5, ``token_set_ratio`` scaled
      to 1.0.
    - rapidfuzz ``partial_ratio`` of query vs. object ID scaled to 0.8.
    - Fraction of the query's jieba tokens that also appear among the
      title's jieba tokens, scaled to 2.0.
    - +0.5 for every query token of length >= 2 that is a proper prefix of
      at least one title token.

    Args:
        query: Search text entered by the user; surrounding whitespace is
            ignored.
        title: Document title.
        object_id: Document object identifier.
        catalog_name: Name of the catalog the document belongs to.
            ``None`` for any argument is treated as an empty string.

    Returns:
        The accumulated score; higher means more relevant. Returns ``0.0``
        when the query is empty or whitespace-only.
    """
    q = (query or "").lower().strip()
    if not q:
        # The empty string is a prefix and substring of every string, so
        # without this guard every document would collect the title, ID
        # and catalog bonuses and look like a strong match.
        return 0.0

    t = (title or "").lower()
    o = (object_id or "").lower()
    c = (catalog_name or "").lower()

    score = 0.0

    # 1. Exact / prefix / substring match on the title (strongest signal).
    if t == q:
        score += 5.0
    elif t.startswith(q):
        score += 4.0
    elif q in t:
        score += 3.0

    if q in o:
        score += 1.5
    if q in c:
        score += 0.5

    # 2. Fuzzy match on the title (each ratio is divided by 100, then weighted).
    score += (fuzz.partial_ratio(q, t) / 100.0) * 2.0
    score += (fuzz.token_sort_ratio(q, t) / 100.0) * 1.5
    score += (fuzz.token_set_ratio(q, t) / 100.0) * 1.0

    # 3. Fuzzy match on the object ID.
    score += (fuzz.partial_ratio(q, o) / 100.0) * 0.8

    # 4. Token overlap via jieba segmentation; whitespace-only tokens are dropped.
    q_words = {w for w in jieba.lcut(q) if w.strip()}
    t_words = {w for w in jieba.lcut(t) if w.strip()}
    if q_words and t_words:
        common = q_words & t_words
        score += (len(common) / len(q_words)) * 2.0

    # 5. Prefix bonus: awarded at most once per query token. Single-character
    #    tokens are ignored and identical tokens are excluded, since those
    #    are already counted by the overlap term above.
    for qw in q_words:
        if len(qw) >= 2 and any(tw != qw and tw.startswith(qw) for tw in t_words):
            score += 0.5

    return score
|
|
359
|
+
|
|
360
|
+
|
|
298
361
|
# ── CLI Helpers ────────────────────────────────────────────────────────────────
|
|
299
362
|
|
|
300
363
|
def parse_doc_url(url: str) -> tuple[str, str]:
|
|
@@ -345,34 +408,58 @@ async def cmd_index(args):
|
|
|
345
408
|
|
|
346
409
|
async def cmd_search(args):
    """Search the documentation index and print ranked matches.

    A page is kept when the query is a case-insensitive substring of its
    title, object ID or catalog name, or when its score from
    ``compute_relevance_score`` is at least 1.5. Kept pages are
    deduplicated by URL and sorted by descending score, with shorter
    titles first on ties. Output is either a JSON object (at most 50
    results) or a text listing (at most 30 results).

    Args:
        args: Parsed CLI arguments. Reads ``args.query`` (the search text)
            and ``args.json`` (truthy to emit JSON instead of text).
    """
    min_fuzzy_score = 1.5  # pages matching only fuzzily need at least this score
    json_limit = 50
    text_limit = 30
    bar_width = 10

    query = args.query.strip()
    if not query:
        # Validate before building the index so an empty query does no work.
        print("请提供搜索关键词。")
        return

    index = await build_index()
    query_lower = query.lower()
    seen = set()
    scored = []

    for page in index.get("all_pages", []):
        url = page.get("url")
        if url is not None and url in seen:
            # Already reported under this URL; skip the scoring work.
            continue

        title = page.get("title", "")
        obj_id = page.get("object_id", "")
        catalog = page.get("catalog_name", "")

        score = compute_relevance_score(query, title, obj_id, catalog)

        # Include if a substring match exists or the fuzzy score is significant.
        substring_hit = (
            query_lower in title.lower()
            or query_lower in obj_id.lower()
            or query_lower in catalog.lower()
        )
        if not substring_hit and score < min_fuzzy_score:
            continue

        # Pages without a URL cannot be deduplicated, so each one is kept.
        if url is not None:
            seen.add(url)

        # Copy so the score annotation never leaks into the shared index.
        page = dict(page)
        page["_score"] = round(score, 2)
        scored.append(page)

    # Sort: higher score first, shorter title as tiebreaker.
    scored.sort(key=lambda p: (-p["_score"], len(p.get("title", ""))))

    if args.json:
        out = {
            "query": args.query,
            "total": len(scored),
            "results": scored[:json_limit],
        }
        print_json(out)
        return

    if not scored:
        print(f"未找到与 '{args.query}' 相关的文档。")
        print(f"可用分类: {', '.join(f'{v}({k})' for k, v in CATALOGS.items())}")
        return

    print(f"搜索结果: '{args.query}' (共 {len(scored)} 篇)\n")
    for page in scored[:text_limit]:
        cat = CATALOGS.get(page.get("catalog_name", ""), page.get("catalog_name", ""))
        # One filled cell per whole score point, capped at the bar width.
        filled = min(int(page["_score"]), bar_width)
        bar = "█" * filled + "░" * (bar_width - filled)
        print(f" {bar} [{cat}] {page.get('title', '')}")
        print(f" {page.get('url', '')}")
    if len(scored) > text_limit:
        print(f"\n...及另外 {len(scored) - text_limit} 篇")
|
|
376
463
|
|
|
377
464
|
|
|
378
465
|
async def cmd_get(args):
|
package/scripts/runner.js
CHANGED
|
@@ -102,8 +102,8 @@ async function ensurePython() {
|
|
|
102
102
|
});
|
|
103
103
|
} catch { /* non-fatal */ }
|
|
104
104
|
|
|
105
|
-
console.error(`${TAG} 正在安装 Python 依赖 (httpx)...`);
|
|
106
|
-
execSync(`"${getVenvPip()}" install httpx`, {
|
|
105
|
+
console.error(`${TAG} 正在安装 Python 依赖 (httpx, rapidfuzz, jieba)...`);
|
|
106
|
+
execSync(`"${getVenvPip()}" install httpx rapidfuzz jieba`, {
|
|
107
107
|
stdio: 'pipe',
|
|
108
108
|
timeout: 120000,
|
|
109
109
|
env: cleanEnv,
|