deepresearch-flow 0.3.0__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30) hide show
  1. deepresearch_flow/paper/db.py +184 -0
  2. deepresearch_flow/paper/db_ops.py +1939 -0
  3. deepresearch_flow/paper/web/app.py +38 -3705
  4. deepresearch_flow/paper/web/constants.py +23 -0
  5. deepresearch_flow/paper/web/filters.py +255 -0
  6. deepresearch_flow/paper/web/handlers/__init__.py +14 -0
  7. deepresearch_flow/paper/web/handlers/api.py +217 -0
  8. deepresearch_flow/paper/web/handlers/pages.py +334 -0
  9. deepresearch_flow/paper/web/markdown.py +549 -0
  10. deepresearch_flow/paper/web/static/css/main.css +857 -0
  11. deepresearch_flow/paper/web/static/js/detail.js +406 -0
  12. deepresearch_flow/paper/web/static/js/index.js +266 -0
  13. deepresearch_flow/paper/web/static/js/outline.js +58 -0
  14. deepresearch_flow/paper/web/static/js/stats.js +39 -0
  15. deepresearch_flow/paper/web/templates/base.html +43 -0
  16. deepresearch_flow/paper/web/templates/detail.html +332 -0
  17. deepresearch_flow/paper/web/templates/index.html +114 -0
  18. deepresearch_flow/paper/web/templates/stats.html +29 -0
  19. deepresearch_flow/paper/web/templates.py +85 -0
  20. deepresearch_flow/paper/web/text.py +68 -0
  21. deepresearch_flow/recognize/cli.py +805 -26
  22. deepresearch_flow/recognize/katex_check.js +29 -0
  23. deepresearch_flow/recognize/math.py +719 -0
  24. deepresearch_flow/recognize/mermaid.py +690 -0
  25. {deepresearch_flow-0.3.0.dist-info → deepresearch_flow-0.4.1.dist-info}/METADATA +78 -4
  26. {deepresearch_flow-0.3.0.dist-info → deepresearch_flow-0.4.1.dist-info}/RECORD +30 -9
  27. {deepresearch_flow-0.3.0.dist-info → deepresearch_flow-0.4.1.dist-info}/WHEEL +0 -0
  28. {deepresearch_flow-0.3.0.dist-info → deepresearch_flow-0.4.1.dist-info}/entry_points.txt +0 -0
  29. {deepresearch_flow-0.3.0.dist-info → deepresearch_flow-0.4.1.dist-info}/licenses/LICENSE +0 -0
  30. {deepresearch_flow-0.3.0.dist-info → deepresearch_flow-0.4.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,23 @@
1
"""Constants for paper web UI."""

from pathlib import Path

# CDN URLs for external libraries rendered into the HTML templates.
CDN_ECHARTS = "https://cdn.jsdelivr.net/npm/echarts@5/dist/echarts.min.js"
CDN_MERMAID = "https://cdn.jsdelivr.net/npm/mermaid@11/dist/mermaid.min.js"
# KaTeX is pinned to one exact version so the CSS and both JS bundles stay in sync.
CDN_KATEX = "https://cdn.jsdelivr.net/npm/katex@0.16.27/dist/katex.min.css"
CDN_KATEX_JS = "https://cdn.jsdelivr.net/npm/katex@0.16.27/dist/katex.min.js"
CDN_KATEX_AUTO = "https://cdn.jsdelivr.net/npm/katex@0.16.27/dist/contrib/auto-render.min.js"

# Use legacy builds to ensure `pdfjsLib` is available as a global.
CDN_PDFJS = "https://cdn.jsdelivr.net/npm/pdfjs-dist@3.11.174/legacy/build/pdf.min.js"
CDN_PDFJS_WORKER = "https://cdn.jsdelivr.net/npm/pdfjs-dist@3.11.174/legacy/build/pdf.worker.min.js"

# PDF.js viewer configuration: the URL path served to the browser, plus
# on-disk asset directories resolved relative to this module's location.
PDFJS_VIEWER_PATH = "/pdfjs/web/viewer.html"
PDFJS_STATIC_DIR = Path(__file__).resolve().parent / "pdfjs"
STATIC_DIR = Path(__file__).resolve().parent / "static"
TEMPLATES_DIR = Path(__file__).resolve().parent / "templates"

# Metadata
REPO_URL = "https://github.com/nerdneilsfield/ai-deepresearch-flow"
@@ -0,0 +1,255 @@
1
+ """Filter, query, and statistics utilities for paper web UI."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any
6
+
7
+ from starlette.requests import Request
8
+
9
+ from deepresearch_flow.paper.db_ops import PaperIndex
10
+ from deepresearch_flow.paper.utils import stable_hash
11
+
12
+
13
# Accepted spellings for presence filters (matched case-insensitively).
BOOL_TRUE = {"1", "true", "yes", "with", "has"}
BOOL_FALSE = {"0", "false", "no", "without"}


def tokenize_filter_query(text: str) -> list[str]:
    """Split a filter query into tokens, honoring double-quoted phrases.

    Quote characters are dropped, each token is stripped of surrounding
    whitespace, and empty tokens are discarded.
    """
    tokens: list[str] = []
    current: list[str] = []
    quoted = False

    def _flush() -> None:
        piece = "".join(current).strip()
        if piece:
            tokens.append(piece)
        current.clear()

    for char in text:
        if char == '"':
            quoted = not quoted
        elif char.isspace() and not quoted:
            _flush()
        else:
            current.append(char)
    _flush()
    return tokens


def normalize_presence_value(value: str) -> str | None:
    """Map a raw presence value to 'with'/'without', or None if unrecognized."""
    lowered = value.strip().lower()
    if lowered in BOOL_TRUE:
        return "with"
    return "without" if lowered in BOOL_FALSE else None


def parse_filter_query(text: str) -> dict[str, set[str]]:
    """Parse a structured filter string (e.g. "pdf:yes tmpl:survey") into sets.

    The result maps ``pdf``/``source``/``summary``/``translated`` to presence
    markers ("with"/"without") and ``template`` to lowercase template tags.
    Tokens without a ``key:value`` shape and unknown keys are ignored.
    """
    presence_keys = ("pdf", "source", "summary", "translated")
    result: dict[str, set[str]] = {name: set() for name in (*presence_keys, "template")}

    for token in tokenize_filter_query(text):
        key, sep, value = token.partition(":")
        if not sep:
            continue
        key = key.strip().lower()
        value = value.strip()
        if not value:
            continue
        parts = [part.strip() for part in value.split(",")]
        if key in ("tmpl", "template"):
            result["template"].update(part.lower() for part in parts if part)
        elif key in presence_keys:
            for part in parts:
                marker = normalize_presence_value(part)
                if marker:
                    result[key].add(marker)
        elif key in ("has", "no"):
            # "has:pdf,summary" / "no:source" shorthand for presence filters.
            marker = "with" if key == "has" else "without"
            for target in (part.lower() for part in parts if part):
                if target in presence_keys:
                    result[target].add(marker)
    return result


def presence_filter(values: list[str]) -> set[str] | None:
    """Reduce raw presence values to a constraint set; None means unconstrained.

    A set covering both "with" and "without" (or nothing recognized at all)
    filters nothing, so None is returned in those cases.
    """
    markers = {normalize_presence_value(value) for value in values}
    markers.discard(None)
    if not markers or markers == {"with", "without"}:
        return None
    return markers
99
+
100
+
101
def merge_filter_set(primary: set[str] | None, secondary: set[str] | None) -> set[str] | None:
    """Combine two filter sets by intersection (AND semantics).

    A falsy side imposes no constraint, so the other side wins unchanged.
    NOTE(review): two disjoint sets intersect to an empty set, which
    downstream `matches_presence` treats as "no constraint" — confirm that
    contradictory filters are meant to match everything.
    """
    if primary and secondary:
        return primary & secondary
    return primary or secondary
108
+
109
+
110
def matches_presence(allowed: set[str] | None, has_value: bool) -> bool:
    """Return True when `has_value` satisfies the presence constraint.

    An empty or None constraint matches everything; otherwise the item
    matches when its presence marker ("with"/"without") is in the set.
    """
    if not allowed:
        return True
    marker = "with" if has_value else "without"
    return marker in allowed
119
+
120
+
121
def template_tag_map(index: PaperIndex) -> dict[str, str]:
    """Map each template tag's lowercase form to its display spelling."""
    mapping: dict[str, str] = {}
    for display_tag in index.template_tags:
        mapping[display_tag.lower()] = display_tag
    return mapping


def compute_counts(index: PaperIndex, ids: set[int]) -> dict[str, Any]:
    """Aggregate presence and per-template statistics over the given paper IDs.

    PDF-only placeholder records are skipped entirely; every other paper
    contributes to the total and to each presence bucket it satisfies.
    """
    template_order = list(index.template_tags)
    template_counts = dict.fromkeys(template_order, 0)
    display_by_lc = template_tag_map(index)
    counters = {"total": 0, "pdf": 0, "source": 0, "summary": 0, "translated": 0}

    for paper_id in ids:
        paper = index.papers[paper_id]
        if paper.get("_is_pdf_only"):
            continue
        counters["total"] += 1
        # Fall back to hashing the source path when no precomputed hash exists.
        source_hash = str(paper.get("source_hash") or stable_hash(str(paper.get("source_path") or paper_id)))
        if source_hash in index.md_path_by_hash:
            counters["source"] += 1
        if source_hash in index.pdf_path_by_hash:
            counters["pdf"] += 1
        if paper.get("_has_summary"):
            counters["summary"] += 1
        if index.translated_md_by_hash.get(source_hash):
            counters["translated"] += 1
        for tag_lc in paper.get("_template_tags_lc") or []:
            display = display_by_lc.get(tag_lc)
            if display:
                template_counts[display] = template_counts.get(display, 0) + 1

    return {
        "total": counters["total"],
        "pdf": counters["pdf"],
        "source": counters["source"],
        "summary": counters["summary"],
        "translated": counters["translated"],
        "templates": template_counts,
        "template_order": template_order,
    }
169
+
170
+
171
def parse_filters(request: Request) -> dict[str, list[str] | str | int]:
    """Parse list-view filters from the request's query parameters.

    Returns a dict with pagination (``page`` >= 1, ``page_size`` clamped to
    1..200), free-text query ``q``, structured filter string ``filter_query``,
    per-facet value lists, and sort settings (``sort_dir`` normalized to
    "asc"/"desc").

    Fix: the numeric params are user-controlled; ``int("abc")`` previously
    raised ValueError and surfaced as a 500. Malformed values now fall back
    to their defaults instead of raising.
    """
    qp = request.query_params

    def _to_int(raw: str, default: int) -> int:
        # Never let a hand-edited URL crash the endpoint.
        try:
            return int(raw)
        except (TypeError, ValueError):
            return default

    page = max(1, _to_int(qp.get("page", "1"), 1))
    page_size = min(max(1, _to_int(qp.get("page_size", "30"), 30)), 200)

    q = qp.get("q", "").strip()
    filter_query = qp.get("fq", "").strip()
    # Drop empty facet values so blank form fields impose no constraint.
    pdf_filters = [item for item in qp.getlist("pdf") if item]
    source_filters = [item for item in qp.getlist("source") if item]
    summary_filters = [item for item in qp.getlist("summary") if item]
    translated_filters = [item for item in qp.getlist("translated") if item]
    template_filters = [item for item in qp.getlist("template") if item]
    sort_by = qp.get("sort_by", "").strip()
    sort_dir = qp.get("sort_dir", "desc").strip().lower()
    if sort_dir not in {"asc", "desc"}:
        sort_dir = "desc"

    return {
        "page": page,
        "page_size": page_size,
        "q": q,
        "filter_query": filter_query,
        "pdf": pdf_filters,
        "source": source_filters,
        "summary": summary_filters,
        "translated": translated_filters,
        "template": template_filters,
        "sort_by": sort_by,
        "sort_dir": sort_dir,
    }
204
+
205
+
206
def safe_int(value: Any) -> int:
    """Coerce *value* to int, treating anything unconvertible as 0."""
    try:
        return int(value)
    except (TypeError, ValueError):
        return 0


def normalize_sort_value(value: Any) -> str:
    """Render *value* as a trimmed lowercase string (falsy values become '')."""
    text = str(value) if value else ""
    return text.strip().lower()


def sorted_ids(
    index: PaperIndex,
    ids: set[int],
    sort_by: str,
    sort_dir: str,
) -> list[int]:
    """Order paper IDs by the requested column and direction.

    Without a sort column the index's natural ordering is preserved. Papers
    whose sort value is missing always sink to the end, in either direction.
    """
    if not sort_by:
        return [paper_id for paper_id in index.ordered_ids if paper_id in ids]
    descending = sort_dir == "desc"

    def extract(paper_id: int) -> tuple[Any, bool]:
        # Returns (comparable value, missing?) for the active sort column.
        paper = index.papers[paper_id]
        if sort_by == "year":
            year = safe_int(paper.get("_year"))
            return (year, safe_int(paper.get("_month"))), year == 0
        if sort_by == "title":
            title = normalize_sort_value(paper.get("paper_title"))
            return title, not title
        if sort_by == "venue":
            venue = normalize_sort_value(paper.get("_venue"))
            return venue, not venue
        if sort_by == "author":
            authors = paper.get("_authors") or paper.get("authors") or []
            lead = normalize_sort_value(authors[0] if authors else "")
            return lead, not lead
        return normalize_sort_value(paper.get("paper_title")), False

    def ranking(paper_id: int) -> tuple[int, Any, int]:
        value, missing = extract(paper_id)
        # The marker flips with the direction so that, after the (possibly
        # reversed) sort, missing values land at the tail either way.
        if descending:
            score = 0 if missing else 1
        else:
            score = 1 if missing else 0
        return (score, value, paper_id)

    return sorted(ids, key=ranking, reverse=descending)
@@ -0,0 +1,14 @@
1
"""Route handlers for paper web UI."""

from .api import api_papers, api_pdf, api_stats
from .pages import index_page, paper_detail, robots_txt, stats_page

# Public surface of the handlers package: application wiring imports the
# handlers from here rather than from the submodules directly.
__all__ = [
    "api_papers",
    "api_pdf",
    "api_stats",
    "index_page",
    "paper_detail",
    "robots_txt",
    "stats_page",
]
@@ -0,0 +1,217 @@
1
+ """API route handlers for paper web UI."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+ from typing import Any
7
+
8
+ from starlette.requests import Request
9
+ from starlette.responses import FileResponse, JSONResponse, Response
10
+
11
+ from deepresearch_flow.paper.db_ops import PaperIndex
12
+ from deepresearch_flow.paper.utils import stable_hash
13
+ from deepresearch_flow.paper.web.filters import (
14
+ compute_counts,
15
+ matches_presence,
16
+ merge_filter_set,
17
+ parse_filters,
18
+ parse_filter_query,
19
+ presence_filter,
20
+ sorted_ids,
21
+ )
22
+ from deepresearch_flow.paper.web.text import extract_summary_snippet, normalize_title, normalize_venue
23
+ from deepresearch_flow.paper.web.query import Query, QueryTerm, parse_query
24
+
25
+
26
+ def _ensure_under_roots(path: Path, roots: list[Path]) -> bool:
27
+ """Check if path is under one of the allowed root directories."""
28
+ resolved = path.resolve()
29
+ for root in roots:
30
+ try:
31
+ resolved.relative_to(root.resolve())
32
+ return True
33
+ except Exception:
34
+ continue
35
+ return False
36
+
37
+
38
def _apply_query(index: PaperIndex, query: Query) -> set[int]:
    """Apply a search query to the paper index and return matching IDs.

    The query is a disjunction of groups (OR); within a group, terms are
    conjoined (AND) and a negated term removes its matches from the group.
    Field lookups prefer the prebuilt exact-match indexes and fall back to
    substring scans; an unrecognized field matches nothing.
    """
    all_ids = set(index.ordered_ids)

    def ids_for_term(term: QueryTerm, base: set[int]) -> set[int]:
        # Resolve one term against `base` and return the matching subset.
        value_lc = term.value.lower()
        if term.field is None:
            # Fieldless term: substring match over the precomputed search blob.
            return {idx for idx in base if value_lc in str(index.papers[idx].get("_search_lc") or "")}
        if term.field == "title":
            return {idx for idx in base if value_lc in str(index.papers[idx].get("_title_lc") or "")}
        if term.field == "venue":
            return {idx for idx in base if value_lc in str(index.papers[idx].get("_venue") or "").lower()}
        if term.field == "tag":
            # Exact hit on the tag index wins; otherwise substring-scan tags.
            exact = index.by_tag.get(value_lc)
            if exact is not None:
                return exact & base
            return {idx for idx in base if any(value_lc in t.lower() for t in (index.papers[idx].get("_tags") or []))}
        if term.field == "author":
            exact = index.by_author.get(value_lc)
            if exact is not None:
                return exact & base
            return {idx for idx in base if any(value_lc in a.lower() for a in (index.papers[idx].get("_authors") or []))}
        if term.field == "month":
            exact = index.by_month.get(value_lc)
            if exact is not None:
                return exact & base
            return {idx for idx in base if value_lc == str(index.papers[idx].get("_month") or "").lower()}
        if term.field == "year":
            # Support inclusive ranges like "2019..2021" (order-insensitive).
            if ".." in term.value:
                start_str, end_str = term.value.split("..", 1)
                if start_str.strip().isdigit() and end_str.strip().isdigit():
                    start = int(start_str.strip())
                    end = int(end_str.strip())
                    ids: set[int] = set()
                    for y in range(min(start, end), max(start, end) + 1):
                        ids |= index.by_year.get(str(y), set())
                    return ids & base
            exact = index.by_year.get(value_lc)
            if exact is not None:
                return exact & base
            return {idx for idx in base if value_lc in str(index.papers[idx].get("_year") or "").lower()}
        return set()

    result: set[int] = set()
    for group in query.groups:
        group_ids = set(all_ids)
        for term in group:
            # Negated terms match against the full corpus so the subtraction
            # removes every hit, not just those still surviving in the group.
            matched = ids_for_term(term, group_ids if not term.negated else all_ids)
            if term.negated:
                group_ids -= matched
            else:
                group_ids &= matched
        result |= group_ids

    return result
93
+
94
+
95
async def api_papers(request: Request) -> JSONResponse:
    """API endpoint for paper list with filtering, sorting, and pagination.

    Combines the free-text query (`q`), the structured filter string (`fq`),
    and the per-facet query parameters (merged with AND semantics), then
    sorts and paginates the surviving IDs. Aggregate stats are computed only
    for page 1 to keep subsequent page fetches cheap.
    """
    index: PaperIndex = request.app.state.index
    filters = parse_filters(request)
    page = int(filters["page"])
    page_size = int(filters["page_size"])
    q = str(filters["q"])
    filter_query = str(filters["filter_query"])
    sort_by = str(filters["sort_by"]).strip().lower()
    sort_dir = str(filters["sort_dir"]).strip().lower()
    # Unknown sort columns fall back to the index's natural ordering.
    if sort_by not in {"year", "title", "venue", "author"}:
        sort_by = ""
    query = parse_query(q)
    candidate = _apply_query(index, query)
    filter_terms = parse_filter_query(filter_query)
    # Facet constraints from query params and from the `fq` string are merged
    # by intersection (AND).
    pdf_filter = merge_filter_set(presence_filter(filters["pdf"]), presence_filter(list(filter_terms["pdf"])))
    source_filter = merge_filter_set(
        presence_filter(filters["source"]), presence_filter(list(filter_terms["source"]))
    )
    summary_filter = merge_filter_set(
        presence_filter(filters["summary"]), presence_filter(list(filter_terms["summary"]))
    )
    translated_filter = merge_filter_set(
        presence_filter(filters["translated"]), presence_filter(list(filter_terms["translated"]))
    )
    template_selected = {item.lower() for item in filters["template"] if item}
    template_filter = merge_filter_set(
        template_selected or None,
        filter_terms["template"] or None,
    )

    # NOTE(review): when the text query matches nothing, facet filtering is
    # skipped — the result is already (vacuously) empty.
    if candidate:
        filtered: set[int] = set()
        for idx in candidate:
            paper = index.papers[idx]
            source_hash = str(paper.get("source_hash") or stable_hash(str(paper.get("source_path") or idx)))
            has_source = source_hash in index.md_path_by_hash
            has_pdf = source_hash in index.pdf_path_by_hash
            has_summary = bool(paper.get("_has_summary"))
            has_translated = bool(index.translated_md_by_hash.get(source_hash))
            if not matches_presence(pdf_filter, has_pdf):
                continue
            if not matches_presence(source_filter, has_source):
                continue
            if not matches_presence(summary_filter, has_summary):
                continue
            if not matches_presence(translated_filter, has_translated):
                continue
            if template_filter:
                tags = paper.get("_template_tags_lc") or []
                if not any(tag in template_filter for tag in tags):
                    continue
            filtered.add(idx)
        candidate = filtered
    ordered = sorted_ids(index, candidate, sort_by, sort_dir)
    total = len(ordered)
    start = (page - 1) * page_size
    end = min(start + page_size, total)
    page_ids = ordered[start:end]
    stats_payload = None
    if page == 1:
        # Stats cover both the whole corpus and the filtered subset.
        all_ids = set(index.ordered_ids)
        stats_payload = {
            "all": compute_counts(index, all_ids),
            "filtered": compute_counts(index, candidate),
        }

    items: list[dict[str, Any]] = []
    for idx in page_ids:
        paper = index.papers[idx]
        source_hash = str(paper.get("source_hash") or stable_hash(str(paper.get("source_path") or idx)))
        translations = index.translated_md_by_hash.get(source_hash, {})
        translation_languages = sorted(translations.keys(), key=str.lower)
        items.append(
            {
                "source_hash": source_hash,
                "title": normalize_title(paper.get("paper_title") or ""),
                "summary_excerpt": extract_summary_snippet(paper),
                "summary_full": paper.get("summary") or "",
                "authors": paper.get("_authors") or [],
                "year": paper.get("_year") or "",
                "month": paper.get("_month") or "",
                "venue": normalize_venue(paper.get("_venue") or ""),
                "tags": paper.get("_tags") or [],
                "template_tags": paper.get("_template_tags") or [],
                "has_source": source_hash in index.md_path_by_hash,
                "has_translation": bool(translation_languages),
                "has_pdf": source_hash in index.pdf_path_by_hash,
                "has_summary": bool(paper.get("_has_summary")),
                "is_pdf_only": bool(paper.get("_is_pdf_only")),
                "translation_languages": translation_languages,
            }
        )

    return JSONResponse(
        {
            "page": page,
            "page_size": page_size,
            "total": total,
            "has_more": end < total,
            "items": items,
            "stats": stats_payload,
        }
    )
199
+
200
+
201
async def api_stats(request: Request) -> JSONResponse:
    """API endpoint returning the index's precomputed statistics payload as JSON."""
    index: PaperIndex = request.app.state.index
    return JSONResponse(index.stats)
205
+
206
+
207
async def api_pdf(request: Request) -> Response:
    """Serve the PDF file registered for a paper's source hash.

    Returns 404 when no PDF is known for the hash, and 403 when the stored
    path falls outside the configured allow-listed roots. An empty root
    list disables the containment check entirely.
    """
    index: PaperIndex = request.app.state.index
    source_hash = request.path_params["source_hash"]
    pdf_path = index.pdf_path_by_hash.get(source_hash)
    if not pdf_path:
        return Response("PDF not found", status_code=404)
    allowed_roots: list[Path] = request.app.state.pdf_roots
    if allowed_roots and not _ensure_under_roots(pdf_path, allowed_roots):
        return Response("Forbidden", status_code=403)
    return FileResponse(pdf_path)