PyPI - outliner-cli - Versions diffs - 0.2.0__tar.gz → 0.3.0__tar.gz - Mend

outliner-cli 0.2.0__tar.gz → 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (65) hide show

{outliner_cli-0.2.0 → outliner_cli-0.3.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: outliner-cli
-Version: 0.2.0
+Version: 0.3.0
 Summary: Print the structural outline of source files for LLM navigation
 Author: Per Cederberg
 License-Expression: MIT
@@ -27,6 +27,7 @@ outliner-cli [OPTIONS] [FILE...]
 | ------------------- | ----------------------------------------------------------------------------- |
 | `-g, --grep EXPR`   | Only show items whose signature matches EXPR (case-insensitive)               |
 | `-s, --syntax LANG` | Override syntax auto-detection when it is ambiguous                           |
+| `-t, --type LANG`   | Only include files of this language (repeatable, accepts name or extension)   |
 | `-w, --width COLS`  | Truncate output lines to COLS (`0`=unlimited, `auto`=terminal, default=`120`) |
 Pass a file, a directory (walked recursively), or omit arguments to read stdin.
@@ -57,7 +58,7 @@ methods) and native-format indentation in the signature (indented for code,
 ## Installation
 ```sh
-pip install outliner
+pip install outliner-cli
 ```
 ## Running
@@ -79,9 +80,9 @@ uv run pytest
 ## Supported Languages
-AsciiDoc, C/C++, C#, Clojure, Go, HTML, Java, JavaScript/TypeScript, Markdown,
-Org-mode, Perl, PHP, Python, reStructuredText, Ruby, Rust, Scala, Shell, Swift,
-and Zig.
+AsciiDoc, C/C++, C#, Clojure, Go, HTML, Java, JavaScript/TypeScript,
+JSON/NDJSON, Markdown, Org-mode, Perl, PHP, Python, reStructuredText, Ruby,
+Rust, Scala, Shell, Swift, XML, and Zig.
 ## Example Use Cases
@@ -132,3 +133,32 @@ $ uvx outliner-cli --grep PaymentMethod src/
  14,12  def charge(method: PaymentMethod, amount: Decimal) -> Receipt
  88,4   def validate(m: PaymentMethod) -> bool
 ```
+**Inspect a dataset without opening it** — JSON/NDJSON files show a schema
+overview with file size, record count, data types, optionality, and truncated
+sample values:
+```
+$ uvx outliner-cli titanic.json
+$             163.9 KB · json · array[891]
+.Age          float|int? -- 22
+.Cabin        str? -- "C85"
+.Embarked     str? -- "S"
+.Fare         float|int -- 7.25
+.Name         str -- "Braund, Mr. Owen Harris"
+.Survived     int -- 0
+```
+XML files show an indented structural outline with XML-native node kinds:
+```
+$ uvx outliner-cli pubmed26n0001.xml
+/                                195.5 MB · xml · sampled 204K elems
+<PubmedArticleSet>               elem
+  <PubmedArticle>                elem+
+    <MedlineCitation>            elem
+      @Status                    attr -- "MEDLINE"
+      <Article>                  elem
+        <ArticleTitle>           text -- "Formate assay in body fluids: applica..."
+        <Abstract>               elem?
+```

{outliner_cli-0.2.0 → outliner_cli-0.3.0}/README.md RENAMED Viewed

@@ -14,6 +14,7 @@ outliner-cli [OPTIONS] [FILE...]
 | ------------------- | ----------------------------------------------------------------------------- |
 | `-g, --grep EXPR`   | Only show items whose signature matches EXPR (case-insensitive)               |
 | `-s, --syntax LANG` | Override syntax auto-detection when it is ambiguous                           |
+| `-t, --type LANG`   | Only include files of this language (repeatable, accepts name or extension)   |
 | `-w, --width COLS`  | Truncate output lines to COLS (`0`=unlimited, `auto`=terminal, default=`120`) |
 Pass a file, a directory (walked recursively), or omit arguments to read stdin.
@@ -44,7 +45,7 @@ methods) and native-format indentation in the signature (indented for code,
 ## Installation
 ```sh
-pip install outliner
+pip install outliner-cli
 ```
 ## Running
@@ -66,9 +67,9 @@ uv run pytest
 ## Supported Languages
-AsciiDoc, C/C++, C#, Clojure, Go, HTML, Java, JavaScript/TypeScript, Markdown,
-Org-mode, Perl, PHP, Python, reStructuredText, Ruby, Rust, Scala, Shell, Swift,
-and Zig.
+AsciiDoc, C/C++, C#, Clojure, Go, HTML, Java, JavaScript/TypeScript,
+JSON/NDJSON, Markdown, Org-mode, Perl, PHP, Python, reStructuredText, Ruby,
+Rust, Scala, Shell, Swift, XML, and Zig.
 ## Example Use Cases
@@ -119,3 +120,32 @@ $ uvx outliner-cli --grep PaymentMethod src/
  14,12  def charge(method: PaymentMethod, amount: Decimal) -> Receipt
  88,4   def validate(m: PaymentMethod) -> bool
 ```
+**Inspect a dataset without opening it** — JSON/NDJSON files show a schema
+overview with file size, record count, data types, optionality, and truncated
+sample values:
+```
+$ uvx outliner-cli titanic.json
+$             163.9 KB · json · array[891]
+.Age          float|int? -- 22
+.Cabin        str? -- "C85"
+.Embarked     str? -- "S"
+.Fare         float|int -- 7.25
+.Name         str -- "Braund, Mr. Owen Harris"
+.Survived     int -- 0
+```
+XML files show an indented structural outline with XML-native node kinds:
+```
+$ uvx outliner-cli pubmed26n0001.xml
+/                                195.5 MB · xml · sampled 204K elems
+<PubmedArticleSet>               elem
+  <PubmedArticle>                elem+
+    <MedlineCitation>            elem
+      @Status                    attr -- "MEDLINE"
+      <Article>                  elem
+        <ArticleTitle>           text -- "Formate assay in body fluids: applica..."
+        <Abstract>               elem?
+```

{outliner_cli-0.2.0 → outliner_cli-0.3.0}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "outliner-cli"
-version = "0.2.0"
+version = "0.3.0"
 description = "Print the structural outline of source files for LLM navigation"
 authors = [{name = "Per Cederberg"}]
 license = "MIT"

{outliner_cli-0.2.0 → outliner_cli-0.3.0}/src/outliner/cli.py RENAMED Viewed

@@ -10,6 +10,9 @@ import sys
 from outliner.parsers import NAMES, EXTENSIONS, detect, outline, syntax
 from outliner.types import OutlineItem
+_TEXT_CONTROLS = "\n\r\t\f\b"
+_BINARY_THRESHOLD = 0.05
 def die(msg: str, code: int = 2) -> None:
     print(f"outliner: {msg}", file=sys.stderr)
@@ -78,9 +81,47 @@ def _format_items(items: list[OutlineItem], grep: re.Pattern | None, line_width:
         items = [it for it in items if grep.search(it.signature)]
     if not items:
         return []
-    num_width = max(it.num_width for it in items)
-    num_width = max(num_width, 3)
-    return [it.format(num_width, line_width) for it in items]
+    fmt_width = max(it.fmt_width for it in items)
+    fmt_width = max(fmt_width, 3)
+    return [it.format(fmt_width, line_width) for it in items]
+def _looks_binary(head: str) -> bool:
+    if "\0" in head:
+        return True
+    if head:
+        controls = sum(1 for ch in head if ord(ch) < 32 and ch not in _TEXT_CONTROLS)
+        replaced = head.count("\ufffd")
+        return (controls + replaced) / len(head) > _BINARY_THRESHOLD
+    return False
+def _format_size(size_bytes: int) -> str:
+    if size_bytes >= 1_000_000_000:
+        return f"{size_bytes / 1_000_000_000:.1f} GB"
+    if size_bytes >= 1_000_000:
+        return f"{size_bytes / 1_000_000:.1f} MB"
+    if size_bytes >= 1_000:
+        return f"{size_bytes / 1_000:.1f} KB"
+    return f"{size_bytes} B"
+def _outline_source(src: str, selected: str | None) -> tuple[list[OutlineItem] | None, str | None]:
+    if src == "-":
+        if selected:
+            return outline(selected, sys.stdin), selected
+        text = sys.stdin.read()
+        match = selected or detect(text)
+        return (outline(match, text) if match else None), match
+    with open(src, encoding="utf-8", errors="replace") as fh:
+        head = fh.read(4096)
+        if _looks_binary(head):
+            size = _format_size(os.path.getsize(src))
+            return [OutlineItem(locator="binary file", signature=size)], "binary"
+        match = selected or guess_syntax(src) or detect(head)
+        fh.seek(0)
+        return (outline(match, fh) if match else None), match
 def main(argv: list[str] | None = None) -> int:
@@ -143,25 +184,18 @@ def main(argv: list[str] | None = None) -> int:
     exit_code = 0
     for src in sources:
         try:
-            if src == "-":
-                text = sys.stdin.read()
-            else:
-                with open(src, encoding="utf-8", errors="replace") as fh:
-                    text = fh.read()
+            items, match = _outline_source(src, args.syntax)
         except OSError as exc:
             print(f"outliner: {exc}", file=sys.stderr)
             exit_code = 1
             continue
-        match = args.syntax or guess_syntax(src) or detect(text)
         if match is None:
             print(f"outliner: cannot auto-detect syntax for '{src}'; use --syntax",
                   file=sys.stderr)
             exit_code = 2
             continue
-        items = outline(match, text)
         if items is None:
             available = ", ".join(NAMES)
             print(f"outliner: unsupported syntax '{match}'; available: {available}",

outliner_cli-0.3.0/src/outliner/parsers/__init__.py ADDED Viewed

@@ -0,0 +1,65 @@
+import io
+import re
+import types
+from typing import TextIO
+from ..types import OutlineItem
+from . import (
+    python, scala, go, java, rust, swift, c, ruby, php, shell, javascript,
+    csharp, perl, zig, clojure, html, asciidoc, orgmode, rst, json, xml,
+    markdown,
+)
+_MODULES = {
+    mod.SYNTAX: mod
+    for mod in globals().values() if (
+        isinstance(mod, types.ModuleType)
+        and mod.__name__.startswith(f"{__name__}.")
+        and hasattr(mod, "SYNTAX")
+        and hasattr(mod, "EXTENSIONS")
+    )
+}
+NAMES = sorted(_MODULES)
+EXTENSIONS = {ext: syntax for syntax, mod in _MODULES.items() for ext in mod.EXTENSIONS}
+_FRONTMATTER_RE = re.compile(r'\A(?:---\n(?:.*\n){0,98}?---\n|\+\+\+\n(?:.*\n){0,98}?\+\+\+\n)')
+def _strip_frontmatter(content: str) -> str:
+    m = _FRONTMATTER_RE.match(content)
+    return content[m.end():] if m else content
+def syntax(name: str) -> str | None:
+    if name in _MODULES:
+        return name
+    ext = name if name.startswith(".") else "." + name
+    return EXTENSIONS.get(ext)
+def detect(content: str) -> str | None:
+    lines = _strip_frontmatter(content).splitlines()[:100]
+    for mod in _MODULES.values():
+        if mod.detect(lines):
+            return mod.SYNTAX
+    return None
+def outline(syntax: str, content: str | TextIO) -> list[OutlineItem] | None:
+    mod = _MODULES.get(syntax)
+    if not mod:
+        return None
+    elif hasattr(mod, "read"):
+        fh = io.StringIO(content) if isinstance(content, str) else content
+        return list(mod.read(fh))
+    else:
+        text = content.read() if hasattr(content, "read") else content
+        return _outline_text(mod, text)
+def _outline_text(mod, content: str) -> list[OutlineItem]:
+    m = _FRONTMATTER_RE.match(content)
+    if not m:
+        return list(mod.parse(content))
+    offset = m.group(0).count('\n')
+    return [OutlineItem(start=it.start + offset, count=it.count, signature=it.signature)
+            for it in mod.parse(content[m.end():])]

outliner_cli-0.3.0/src/outliner/parsers/html.py ADDED Viewed

@@ -0,0 +1,357 @@
+"""HTML outline parser using Python's tokenizing HTMLParser."""
+import html
+import re
+from collections.abc import Iterator
+from dataclasses import dataclass, field
+from html.parser import HTMLParser
+from outliner.types import OutlineItem
+SYNTAX = "html"
+EXTENSIONS = (".html", ".htm", ".xhtml")
+_DOCTYPE_RE = re.compile(r'^\s*<!DOCTYPE\s+html', re.IGNORECASE)
+_HTML_TAG_RE = re.compile(r'^\s*<html[\s>]', re.IGNORECASE)
+_WS_RE = re.compile(r'\s+')
+_SENTENCE_RE = re.compile(r'(?<=[.!?])\s+')
+_STRUCTURAL = {"head", "body"}
+_LANDMARKS = {"nav", "main", "article", "section", "header"}
+_OUTLINE_TAGS = _STRUCTURAL | _LANDMARKS
+_CONTENT_TAGS = {"body"} | _LANDMARKS
+_TEXT_SKIP_TAGS = {"script", "style", "svg", "noscript", "template"}
+_EXCERPT_LIMIT = 80
+_BORING_EXCERPTS = {
+    "advertisement", "close", "menu", "navigation", "open menu",
+    "search", "skip advertisement", "skip to content",
+}
+_VOID_TAGS = {
+    "area", "base", "br", "col", "embed", "hr", "img", "input",
+    "link", "meta", "param", "source", "track", "wbr",
+}
+@dataclass
+class _Node:
+    tag: str
+    start: int
+    start_col: int
+    attrs: dict[str, str]
+    depth: int
+    text_parts: list[str] = field(default_factory=list)
+    heading_text: str = ""
+    end: int | None = None
+    @property
+    def signature(self) -> str:
+        return _block_sig(
+            self.tag,
+            self.attrs,
+            self.heading_text or "".join(self.text_parts),
+            self.depth,
+        )
+    @property
+    def has_identity(self) -> bool:
+        return (
+            self.tag in _STRUCTURAL
+            or self.tag == "main"
+            or bool(self.attrs.get("id"))
+            or bool(_clean(self.attrs.get("aria-label", "")))
+            or bool(_excerpt(self.heading_text or "".join(self.text_parts)))
+        )
+@dataclass
+class _Heading:
+    tag: str
+    level: int
+    start: int
+    start_col: int
+    base_depth: int
+    context_key: int
+    text_parts: list[str] = field(default_factory=list)
+class _Parser(HTMLParser):
+    def __init__(self, line_count: int):
+        super().__init__(convert_charrefs=False)
+        self.line_count = line_count
+        self.nodes: list[_Node] = []
+        self.headings: list[tuple[int, int, int, str]] = []
+        self.titles: list[tuple[int, int, OutlineItem]] = []
+        self._stack: list[_Node] = []
+        self._heading: _Heading | None = None
+        self._title: _Heading | None = None
+        self._text_skip: list[str] = []
+        self._heading_stacks: dict[int, list[tuple[int, int]]] = {}
+    def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
+        tag = tag.lower()
+        if self._text_skip:
+            if tag in _TEXT_SKIP_TAGS:
+                self._text_skip.append(tag)
+            return
+        if tag in _TEXT_SKIP_TAGS:
+            self._text_skip.append(tag)
+            return
+        line, column = self.getpos()
+        attrs_by_name = {name.lower(): value or "" for name, value in attrs}
+        if tag in _OUTLINE_TAGS:
+            if tag in _STRUCTURAL and self._inside_content():
+                return
+            node = _Node(
+                tag=tag,
+                start=line,
+                start_col=column,
+                attrs=attrs_by_name,
+                depth=self._tag_depth(tag),
+            )
+            self.nodes.append(node)
+            if tag not in _VOID_TAGS:
+                self._stack.append(node)
+        elif tag in _VOID_TAGS:
+            return
+        if _is_heading(tag):
+            self._heading = _Heading(
+                tag=tag,
+                level=int(tag[1]),
+                start=line,
+                start_col=column,
+                base_depth=self._heading_base_depth(),
+                context_key=self._heading_context_key(),
+            )
+        elif tag == "title" and self._inside_document_head():
+            self._title = _Heading(
+                tag=tag,
+                level=0,
+                start=line,
+                start_col=column,
+                base_depth=self._tag_depth(tag),
+                context_key=0,
+            )
+    def handle_startendtag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
+        tag = tag.lower()
+        if self._text_skip:
+            return
+        if tag in _OUTLINE_TAGS:
+            if tag in _STRUCTURAL and self._inside_content():
+                return
+            attrs_by_name = {name.lower(): value or "" for name, value in attrs}
+            line, column = self.getpos()
+            self.nodes.append(_Node(
+                tag=tag,
+                start=line,
+                start_col=column,
+                end=line,
+                attrs=attrs_by_name,
+                depth=self._tag_depth(tag),
+            ))
+    def handle_endtag(self, tag: str) -> None:
+        tag = tag.lower()
+        if self._text_skip:
+            if tag == self._text_skip[-1]:
+                self._text_skip.pop()
+            return
+        line = self.getpos()[0]
+        if self._heading and tag == self._heading.tag:
+            heading = self._heading
+            text = _clean("".join(heading.text_parts))
+            depth = self._heading_depth(heading)
+            self.headings.append((
+                heading.start,
+                heading.start_col,
+                heading.level,
+                f"{'  ' * depth}<{heading.tag}>{text}</{heading.tag}>",
+            ))
+            for node in reversed(self._stack):
+                if node.tag in _LANDMARKS and not node.heading_text:
+                    node.heading_text = text
+                    break
+            self._heading = None
+        elif self._title and tag == "title":
+            title = self._title
+            text = _clean("".join(title.text_parts))
+            if text:
+                self.titles.append((title.start, title.start_col, OutlineItem(
+                    start=title.start,
+                    count=max(1, line - title.start + 1),
+                    signature=f"{'  ' * title.base_depth}<title>{text}</title>",
+                )))
+            self._title = None
+        if tag in _OUTLINE_TAGS:
+            for idx in range(len(self._stack) - 1, -1, -1):
+                node = self._stack[idx]
+                if node.tag == tag:
+                    node.end = line
+                    del self._stack[idx:]
+                    break
+    def handle_data(self, data: str) -> None:
+        if self._heading:
+            self._heading.text_parts.append(data)
+        elif self._title:
+            self._title.text_parts.append(data)
+        elif not self._text_skip:
+            self._add_text(data)
+    def handle_entityref(self, name: str) -> None:
+        if self._heading:
+            self._heading.text_parts.append(f"&{name};")
+        elif self._title:
+            self._title.text_parts.append(f"&{name};")
+        elif not self._text_skip:
+            self._add_text(f"&{name};", glue=True)
+    def handle_charref(self, name: str) -> None:
+        if self._heading:
+            self._heading.text_parts.append(f"&#{name};")
+        elif self._title:
+            self._title.text_parts.append(f"&#{name};")
+        elif not self._text_skip:
+            self._add_text(f"&#{name};", glue=True)
+    def close(self) -> None:
+        super().close()
+        if self._heading:
+            heading = self._heading
+            text = _clean("".join(heading.text_parts))
+            depth = self._heading_depth(heading)
+            self.headings.append((
+                heading.start,
+                heading.start_col,
+                heading.level,
+                f"{'  ' * depth}<{heading.tag}>{text}</{heading.tag}>",
+            ))
+            self._heading = None
+        if self._title:
+            title = self._title
+            text = _clean("".join(title.text_parts))
+            if text:
+                self.titles.append((title.start, title.start_col, OutlineItem(
+                    start=title.start,
+                    count=max(1, self.line_count - title.start + 1),
+                    signature=f"{'  ' * title.base_depth}<title>{text}</title>",
+                )))
+            self._title = None
+        for node in self._stack:
+            node.end = self.line_count
+    def _inside_content(self) -> bool:
+        return any(node.tag in _CONTENT_TAGS for node in self._stack)
+    def _inside_document_head(self) -> bool:
+        return any(node.tag == "head" for node in self._stack) and not self._inside_content()
+    def _outline_depth(self) -> int:
+        return len([
+            node for node in self._stack
+            if node.has_identity and node.tag not in _STRUCTURAL
+        ])
+    def _tag_depth(self, tag: str) -> int:
+        return 0 if tag in _STRUCTURAL else self._outline_depth() + 1
+    def _heading_context_node(self) -> _Node | None:
+        for node in reversed(self._stack):
+            if node.tag not in _STRUCTURAL and node.tag in _OUTLINE_TAGS:
+                return node
+        return None
+    def _heading_base_depth(self) -> int:
+        node = self._heading_context_node()
+        return node.depth + 1 if node else 1
+    def _heading_context_key(self) -> int:
+        node = self._heading_context_node()
+        return id(node) if node else 0
+    def _heading_depth(self, heading: _Heading) -> int:
+        stack = self._heading_stacks.setdefault(heading.context_key, [])
+        while stack and stack[-1][0] >= heading.level:
+            stack.pop()
+        depth = stack[-1][1] + 1 if stack else heading.base_depth
+        stack.append((heading.level, depth))
+        return depth
+    def _add_text(self, text: str, glue: bool = False) -> None:
+        if not text.strip():
+            return
+        for node in reversed(self._stack):
+            if node.tag in _LANDMARKS:
+                if node.text_parts and not (glue or node.text_parts[-1].endswith(";")):
+                    node.text_parts.append(" ")
+                node.text_parts.append(text)
+                break
+def _is_heading(tag: str) -> bool:
+    return len(tag) == 2 and tag[0] == "h" and tag[1] in "123456"
+def _clean(text: str) -> str:
+    return _WS_RE.sub(" ", html.unescape(text)).strip()
+def _block_sig(tag: str, attrs: dict[str, str], fallback_text: str = "", depth: int = 0) -> str:
+    ident = f"#{attrs['id']}" if attrs.get("id") else ""
+    label = _clean(attrs.get("aria-label", ""))
+    label_attr = f' aria-label="{label}"' if label else ""
+    excerpt = _excerpt(fallback_text) if tag in _LANDMARKS else ""
+    text = excerpt if excerpt and not label else ""
+    return f"{'  ' * depth}<{tag}{ident}{label_attr}>{text}"
+def _excerpt(text: str) -> str:
+    text = _clean(text)
+    if not text:
+        return ""
+    first = _SENTENCE_RE.split(text, maxsplit=1)[0]
+    if first.lower() in _BORING_EXCERPTS:
+        return ""
+    return first if len(first) <= _EXCERPT_LIMIT else first[:_EXCERPT_LIMIT - 3].rstrip() + "..."
+def _items_from_headings(
+    headings: list[tuple[int, int, int, str]],
+    line_count: int,
+) -> Iterator[tuple[int, int, OutlineItem]]:
+    for idx, (line, column, level, signature) in enumerate(headings):
+        end = line_count + 1
+        for future_line, _, future_level, _ in headings[idx + 1:]:
+            if future_level <= level:
+                end = future_line
+                break
+        yield (line, column, OutlineItem(start=line, count=max(1, end - line), signature=signature))
+def detect(lines: list[str]) -> bool:
+    for line in lines[:30]:
+        if _DOCTYPE_RE.match(line) or _HTML_TAG_RE.match(line):
+            return True
+    return False
+def parse(text: str) -> Iterator[OutlineItem]:
+    line_count = len(text.splitlines())
+    parser = _Parser(line_count)
+    parser.feed(text)
+    parser.close()
+    block_items = (
+        (node.start, node.start_col, OutlineItem(
+            start=node.start,
+            count=max(1, (node.end or line_count) - node.start + 1),
+            signature=node.signature,
+        ))
+        for node in parser.nodes if node.has_identity
+    )
+    events = [*parser.titles, *_items_from_headings(parser.headings, line_count), *block_items]
+    for _, _, item in sorted(events, key=lambda event: (event[0], event[1])):
+        yield item

outliner-cli 0.2.0__tar.gz → 0.3.0__tar.gz

outliner-cli 0.2.0__tar.gz → 0.3.0__tar.gz