deepresearch-flow 0.1.1__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepresearch_flow/__main__.py +7 -0
- deepresearch_flow/paper/db.py +34 -1
- deepresearch_flow/paper/extract.py +10 -3
- deepresearch_flow/paper/web/app.py +1417 -356
- {deepresearch_flow-0.1.1.dist-info → deepresearch_flow-0.2.0.dist-info}/METADATA +14 -6
- {deepresearch_flow-0.1.1.dist-info → deepresearch_flow-0.2.0.dist-info}/RECORD +10 -9
- {deepresearch_flow-0.1.1.dist-info → deepresearch_flow-0.2.0.dist-info}/WHEEL +0 -0
- {deepresearch_flow-0.1.1.dist-info → deepresearch_flow-0.2.0.dist-info}/entry_points.txt +0 -0
- {deepresearch_flow-0.1.1.dist-info → deepresearch_flow-0.2.0.dist-info}/licenses/LICENSE +0 -0
- {deepresearch_flow-0.1.1.dist-info → deepresearch_flow-0.2.0.dist-info}/top_level.txt +0 -0
deepresearch_flow/paper/db.py
CHANGED
|
@@ -30,7 +30,12 @@ except ImportError:
|
|
|
30
30
|
|
|
31
31
|
|
|
32
32
|
def load_json(path: Path) -> list[dict[str, Any]]:
|
|
33
|
-
|
|
33
|
+
data = json.loads(path.read_text(encoding="utf-8"))
|
|
34
|
+
if isinstance(data, dict) and isinstance(data.get("papers"), list):
|
|
35
|
+
return data["papers"]
|
|
36
|
+
if isinstance(data, list):
|
|
37
|
+
return data
|
|
38
|
+
raise click.ClickException("Input JSON must be a list or {template_tag, papers}")
|
|
34
39
|
|
|
35
40
|
|
|
36
41
|
def write_json(path: Path, data: Any) -> None:
|
|
@@ -378,9 +383,25 @@ def register_db_commands(db_group: click.Group) -> None:
|
|
|
378
383
|
month_counts: dict[str, int] = {}
|
|
379
384
|
author_counts: dict[str, int] = {}
|
|
380
385
|
tag_counts: dict[str, int] = {}
|
|
386
|
+
keyword_counts: dict[str, int] = {}
|
|
381
387
|
journal_counts: dict[str, int] = {}
|
|
382
388
|
conference_counts: dict[str, int] = {}
|
|
383
389
|
other_venue_counts: dict[str, int] = {}
|
|
390
|
+
def normalize_keywords(value: Any) -> list[str]:
|
|
391
|
+
if value is None:
|
|
392
|
+
return []
|
|
393
|
+
if isinstance(value, list):
|
|
394
|
+
items = value
|
|
395
|
+
elif isinstance(value, str):
|
|
396
|
+
items = re.split(r"[;,]", value)
|
|
397
|
+
else:
|
|
398
|
+
items = [value]
|
|
399
|
+
normalized: list[str] = []
|
|
400
|
+
for item in items:
|
|
401
|
+
token = str(item).strip().lower()
|
|
402
|
+
if token:
|
|
403
|
+
normalized.append(token)
|
|
404
|
+
return normalized
|
|
384
405
|
for paper in papers:
|
|
385
406
|
bibtex_fields = {}
|
|
386
407
|
bibtex_type = None
|
|
@@ -406,6 +427,8 @@ def register_db_commands(db_group: click.Group) -> None:
|
|
|
406
427
|
author_counts[author] = author_counts.get(author, 0) + 1
|
|
407
428
|
for tag in paper.get("ai_generated_tags") or []:
|
|
408
429
|
tag_counts[tag] = tag_counts.get(tag, 0) + 1
|
|
430
|
+
for keyword in normalize_keywords(paper.get("keywords")):
|
|
431
|
+
keyword_counts[keyword] = keyword_counts.get(keyword, 0) + 1
|
|
409
432
|
|
|
410
433
|
venue = None
|
|
411
434
|
if bibtex_type in {"article"}:
|
|
@@ -541,6 +564,16 @@ def register_db_commands(db_group: click.Group) -> None:
|
|
|
541
564
|
tag_table.add_row(tag, str(count), f"{percentage:.1f}%")
|
|
542
565
|
console.print(tag_table)
|
|
543
566
|
|
|
567
|
+
if keyword_counts:
|
|
568
|
+
keyword_table = Table(title=f"Top {top_n} Keywords")
|
|
569
|
+
keyword_table.add_column("Keyword", style="cyan")
|
|
570
|
+
keyword_table.add_column("Count", style="green", justify="right")
|
|
571
|
+
keyword_table.add_column("Percentage", style="yellow", justify="right")
|
|
572
|
+
for keyword, count in sorted(keyword_counts.items(), key=lambda item: item[1], reverse=True)[:top_n]:
|
|
573
|
+
percentage = (count / total * 100) if total else 0
|
|
574
|
+
keyword_table.add_row(keyword, str(count), f"{percentage:.1f}%")
|
|
575
|
+
console.print(keyword_table)
|
|
576
|
+
|
|
544
577
|
@db_group.command("serve")
|
|
545
578
|
@click.option("-i", "--input", "input_paths", multiple=True, required=True, help="Input JSON file path")
|
|
546
579
|
@click.option("-b", "--bibtex", "bibtex_path", default=None, help="Optional BibTeX file path")
|
|
deepresearch_flow/paper/extract.py
CHANGED
|
@@ -285,9 +285,14 @@ def load_existing(path: Path) -> list[dict[str, Any]]:
|
|
|
285
285
|
if not path.exists():
|
|
286
286
|
return []
|
|
287
287
|
try:
|
|
288
|
-
|
|
288
|
+
data = json.loads(path.read_text(encoding="utf-8"))
|
|
289
289
|
except json.JSONDecodeError:
|
|
290
290
|
return []
|
|
291
|
+
if isinstance(data, dict) and isinstance(data.get("papers"), list):
|
|
292
|
+
return data["papers"]
|
|
293
|
+
if isinstance(data, list):
|
|
294
|
+
return data
|
|
295
|
+
return []
|
|
291
296
|
|
|
292
297
|
|
|
293
298
|
def load_errors(path: Path) -> list[dict[str, Any]]:
|
|
@@ -462,6 +467,7 @@ async def extract_documents(
|
|
|
462
467
|
) -> None:
|
|
463
468
|
start_time = time.monotonic()
|
|
464
469
|
markdown_files = discover_markdown(inputs, glob_pattern, recursive=True)
|
|
470
|
+
template_tag = prompt_template if not custom_prompt else "custom"
|
|
465
471
|
|
|
466
472
|
if retry_failed:
|
|
467
473
|
error_entries = load_errors(errors_path)
|
|
@@ -798,7 +804,8 @@ async def extract_documents(
|
|
|
798
804
|
if path not in seen:
|
|
799
805
|
final_results.append(entry)
|
|
800
806
|
|
|
801
|
-
|
|
807
|
+
output_payload = {"template_tag": template_tag, "papers": final_results}
|
|
808
|
+
write_json(output_path, output_payload)
|
|
802
809
|
|
|
803
810
|
error_payload = [
|
|
804
811
|
{
|
|
@@ -823,7 +830,7 @@ async def extract_documents(
|
|
|
823
830
|
continue
|
|
824
831
|
base_name = split_output_name(Path(source_path))
|
|
825
832
|
file_name = unique_split_name(base_name, used_names, source_path)
|
|
826
|
-
write_json(target_dir / f"{file_name}.json", entry)
|
|
833
|
+
write_json(target_dir / f"{file_name}.json", {"template_tag": template_tag, "papers": [entry]})
|
|
827
834
|
|
|
828
835
|
if render_md:
|
|
829
836
|
try:
|