org-parser 0.23.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- org_parser/__init__.py +116 -0
- org_parser/_from_source.py +78 -0
- org_parser/_lang.py +12 -0
- org_parser/_node.py +81 -0
- org_parser/_nodes.py +231 -0
- org_parser/document/__init__.py +16 -0
- org_parser/document/_body.py +156 -0
- org_parser/document/_document.py +1133 -0
- org_parser/document/_heading.py +1653 -0
- org_parser/document/_loader.py +41 -0
- org_parser/element/__init__.py +83 -0
- org_parser/element/_babel.py +172 -0
- org_parser/element/_block.py +1144 -0
- org_parser/element/_dirty_list.py +60 -0
- org_parser/element/_dispatch.py +131 -0
- org_parser/element/_drawer.py +565 -0
- org_parser/element/_element.py +460 -0
- org_parser/element/_keyword.py +397 -0
- org_parser/element/_list.py +787 -0
- org_parser/element/_paragraph.py +103 -0
- org_parser/element/_structure.py +324 -0
- org_parser/element/_structure_recovery.py +70 -0
- org_parser/element/_table.py +445 -0
- org_parser/py.typed +0 -0
- org_parser/text/__init__.py +63 -0
- org_parser/text/_inline.py +392 -0
- org_parser/text/_rich_text.py +659 -0
- org_parser/time/__init__.py +6 -0
- org_parser/time/_clock.py +190 -0
- org_parser/time/_timestamp.py +1037 -0
- org_parser-0.23.5.dist-info/METADATA +202 -0
- org_parser-0.23.5.dist-info/RECORD +34 -0
- org_parser-0.23.5.dist-info/WHEEL +4 -0
- org_parser-0.23.5.dist-info/licenses/LICENSE +21 -0
org_parser/__init__.py
ADDED
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
"""org_parser — Python bindings for the tree-sitter org-mode parser.
|
|
2
|
+
|
|
3
|
+
This package provides convenience helpers for loading and dumping Org Mode
|
|
4
|
+
documents as [org_parser.document.Document][] instances.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
|
|
11
|
+
from org_parser._lang import PARSER
|
|
12
|
+
from org_parser.document import Document
|
|
13
|
+
|
|
14
|
+
__all__ = ["Document", "dump", "dumps", "load", "loads"]
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def load(filename: str) -> Document:
    """Read and parse an Org Mode file from disk.

    Args:
        filename: Path to the Org Mode file.

    Returns:
        Parsed [org_parser.document.Document][] instance.

    Example:
        ```python
        >>> from org_parser import load
        >>> document = load('path/to/file.org')
        >>> document.children[0].title_text
        'Some heading'
        ```
    """
    raw = Path(filename).read_bytes()
    parsed_tree = PARSER.parse(raw)
    return Document.from_tree(parsed_tree, filename, raw)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def loads(source: str, filename: str | None = None) -> Document:
    """Parse an Org Mode document from a string.

    Args:
        source: Org Mode text to parse.
        filename: Optional filename to assign to the parsed document.

    Returns:
        Parsed [org_parser.document.Document][] instance.

    Example:
        ```python
        >>> from org_parser import loads
        >>> document = loads("* TODO Heading 1")
        >>> document.children[0].todo
        'TODO'
        ```
    """
    raw = source.encode()
    parsed_tree = PARSER.parse(raw)
    # A document parsed from a bare string gets an empty filename.
    name = "" if filename is None else filename
    return Document.from_tree(parsed_tree, name, raw)
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def dumps(document: Document) -> str:
    """Serialize a parsed document back to Org Mode text.

    The complete document — zeroth section and all headings — is rendered.
    Clean (unmodified) parse-backed documents yield their original source
    verbatim; dirty documents are rebuilt from their semantic fields.

    Args:
        document: Parsed document instance.

    Returns:
        Full Org Mode source text.

    Example:
        ```python
        >>> from org_parser import dumps, loads
        >>> document = loads("* TODO Heading 1")
        >>> dumps(document).startswith("* TODO")
        True
        ```
    """
    return document.render()
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def dump(document: Document, filename: str | None = None) -> None:
    """Write a parsed document to disk.

    The output path is *filename* when provided; otherwise
    [document.filename][org_parser.document.Document.filename]. The file is
    written UTF-8 encoded, matching the encoding [org_parser.loads][] uses
    when turning string input into bytes.

    Args:
        document: Parsed document instance.
        filename: Optional output path.

    Raises:
        ValueError: If neither *filename* nor ``document.filename`` is set.

    Example:
        ```python
        >>> from pathlib import Path
        >>> from org_parser import dump, loads
        >>> document = loads("* TODO Heading 1")
        >>> dump(document, 'path/to/file.org')
        >>> out = Path('path/to/file.org')
        >>> out.read_text().startswith("* TODO")
        True
        ```
    """
    target = filename if filename is not None else document.filename
    # Treat both None and "" as "no usable output path". The previous
    # `target == ""` check let a None document.filename fall through to
    # Path(None), raising TypeError instead of the documented ValueError.
    if not target:
        raise ValueError("No output filename provided")
    # Explicit UTF-8 so output does not depend on the locale's preferred
    # encoding; loads() encodes its input the same way.
    Path(target).write_text(dumps(document), encoding="utf-8")
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
"""Shared strict parsing helpers for ``from_source`` constructors.
|
|
2
|
+
|
|
3
|
+
These helpers centralize parse-then-extract flows used by semantic
|
|
4
|
+
``from_source`` class methods.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from typing import TYPE_CHECKING, TypeVar
|
|
10
|
+
|
|
11
|
+
from org_parser._lang import PARSER
|
|
12
|
+
|
|
13
|
+
if TYPE_CHECKING:
|
|
14
|
+
from collections.abc import Callable
|
|
15
|
+
|
|
16
|
+
from org_parser.document._document import Document
|
|
17
|
+
|
|
18
|
+
__all__ = ["parse_document_from_source", "parse_source_with_extractor"]
|
|
19
|
+
|
|
20
|
+
_ExtractedT = TypeVar("_ExtractedT")
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def parse_document_from_source(source: str, *, filename: str = "") -> Document:
    """Parse *source* and return a strict parse-backed :class:`Document`.

    Args:
        source: Org source text to parse.
        filename: Optional filename assigned to the parsed document.

    Returns:
        The parsed semantic :class:`Document`.

    Raises:
        ValueError: If the parse tree contains any error or missing nodes.
    """
    encoded = source.encode()
    parsed_tree = PARSER.parse(encoded)

    # Imported lazily to avoid a circular import at module load time.
    from org_parser.document._document import Document

    document = Document.from_tree(parsed_tree, filename, encoded)
    if document.errors:
        raise ValueError("Source contains parse errors")
    return document
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def parse_source_with_extractor(
    source: str,
    *,
    extractor: Callable[[Document], _ExtractedT | None],
) -> tuple[_ExtractedT, Document]:
    """Parse *source*, validate syntax, and extract one semantic value.

    Args:
        source: Org source text to parse.
        extractor: Callback that receives ``document`` and returns
            the specific semantic value to return.

    Returns:
        A ``(extracted, document)`` tuple.

    Raises:
        ValueError: If the source cannot be parsed cleanly or no valid value is
            extracted.
    """
    # Delegate the parse-then-validate flow to the shared helper instead of
    # duplicating it here, so strict-parse error handling lives in one place.
    document = parse_document_from_source(source)

    extracted = extractor(document)
    if extracted is None:
        raise ValueError("Unexpected parse tree structure")
    return extracted, document
|
org_parser/_lang.py
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
"""Internal: tree-sitter Language and Parser singletons for Org Mode."""
|
|
2
|
+
|
|
3
|
+
from tree_sitter import Language, Parser
|
|
4
|
+
import tree_sitter_org
|
|
5
|
+
|
|
6
|
+
__all__ = ["ORG_LANGUAGE", "PARSER"]
|
|
7
|
+
|
|
8
|
+
#: The Org Mode :class:`~tree_sitter.Language` instance (module-level singleton).
|
|
9
|
+
ORG_LANGUAGE: Language = Language(tree_sitter_org.language())
|
|
10
|
+
|
|
11
|
+
#: A :class:`~tree_sitter.Parser` pre-configured with :data:`ORG_LANGUAGE`.
|
|
12
|
+
PARSER: Parser = Parser(ORG_LANGUAGE)
|
org_parser/_node.py
ADDED
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
"""Shared tree-sitter node utilities.
|
|
2
|
+
|
|
3
|
+
These helpers centralise recurring patterns for inspecting and extracting
|
|
4
|
+
decoded source text from tree-sitter nodes:
|
|
5
|
+
|
|
6
|
+
* :func:`is_error_node` — classify a node as an error or missing token.
* :func:`node_source` — extract the decoded source text of a node when you
  hold a :class:`~org_parser.document._document.Document` reference and need
  to reach back into it (e.g. inside ``__str__`` methods on element objects).
|
|
12
|
+
|
|
13
|
+
:func:`node_source` returns an empty string rather than raising when the node
or document argument is ``None``, so callers do not need separate guard
clauses.
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
from __future__ import annotations
|
|
19
|
+
|
|
20
|
+
from typing import TYPE_CHECKING
|
|
21
|
+
|
|
22
|
+
if TYPE_CHECKING:
|
|
23
|
+
import tree_sitter
|
|
24
|
+
|
|
25
|
+
from org_parser.document._document import Document
|
|
26
|
+
|
|
27
|
+
__all__ = ["is_error_node", "node_source", "report_internal_parse_errors"]
|
|
28
|
+
|
|
29
|
+
_ERROR_NODE_TYPE = "ERROR"
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def is_error_node(node: tree_sitter.Node) -> bool:
|
|
33
|
+
"""Return *True* if *node* is a parse-error or missing token.
|
|
34
|
+
|
|
35
|
+
Args:
|
|
36
|
+
node: Any tree-sitter node to inspect.
|
|
37
|
+
|
|
38
|
+
Returns:
|
|
39
|
+
``True`` for ``ERROR``-typed nodes and for nodes where
|
|
40
|
+
``node.is_missing`` is set by the parser's error-recovery.
|
|
41
|
+
"""
|
|
42
|
+
return node.type == _ERROR_NODE_TYPE or node.is_missing
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def node_source(node: tree_sitter.Node | None, document: Document | None) -> str:
    """Return the decoded source text of *node* within *document*.

    Args:
        node: A tree-sitter node, or ``None`` for programmatically
            constructed elements that carry no parse-tree backing.
        document: The owning :class:`~org_parser.document._document.Document`,
            or ``None``.

    Returns:
        The decoded source slice; an empty string when either argument is
        ``None``.

    Raises:
        ValueError: If the provided document has no backing source bytes.
    """
    if node is not None and document is not None:
        return document.source_for(node).decode()
    return ""
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def report_internal_parse_errors(node: tree_sitter.Node, document: Document) -> None:
    """Report top-level parse-error descendants inside *node*.

    Records ``ERROR`` and missing nodes nested within a semantic object's
    parse subtree so object-internal parse issues reach
    :attr:`Document.errors`. Only the outermost error region of each
    malformed segment is reported; errors nested inside an already-reported
    error node are skipped to avoid duplicate entries.
    """
    # Iterative preorder walk; each entry carries whether an ancestor was
    # already reported as an error.
    pending: list[tuple[tree_sitter.Node, bool]] = [(node, False)]
    while pending:
        current, inside_error = pending.pop()
        found_error = is_error_node(current)
        if found_error and not inside_error:
            document.report_error(current)
        mark_children = inside_error or found_error
        # Push children reversed so pops visit them in source order.
        for child in reversed(current.children):
            pending.append((child, mark_children))
|
org_parser/_nodes.py
ADDED
|
@@ -0,0 +1,231 @@
|
|
|
1
|
+
"""Tree-sitter grammar node-type name constants for the Org Mode grammar.
|
|
2
|
+
|
|
3
|
+
All string constants here correspond to ``node.type`` values produced by the
|
|
4
|
+
compiled Org Mode tree-sitter grammar. Centralising them here prevents the
|
|
5
|
+
same literal from being scattered across every module that dispatches on node
|
|
6
|
+
types.
|
|
7
|
+
|
|
8
|
+
Constants are grouped by semantic domain and kept in alphabetical order within
|
|
9
|
+
each group.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
__all__ = [
|
|
15
|
+
"ANGLE_LINK",
|
|
16
|
+
"AUTHOR",
|
|
17
|
+
"BABEL_CALL",
|
|
18
|
+
"BLANK_LINE",
|
|
19
|
+
"BOLD",
|
|
20
|
+
"CAPTION_KEYWORD",
|
|
21
|
+
"CATEGORY",
|
|
22
|
+
"CENTER_BLOCK",
|
|
23
|
+
"CITATION",
|
|
24
|
+
"CLOCK",
|
|
25
|
+
"CLOSED",
|
|
26
|
+
"CODE",
|
|
27
|
+
"COMMENT",
|
|
28
|
+
"COMMENT_BLOCK",
|
|
29
|
+
"COMPLETION_COUNTER",
|
|
30
|
+
"DEADLINE",
|
|
31
|
+
"DELAY_MARK",
|
|
32
|
+
"DESCRIPTION",
|
|
33
|
+
"DRAWER",
|
|
34
|
+
"DYNAMIC_BLOCK",
|
|
35
|
+
"ENTITY",
|
|
36
|
+
"EXAMPLE_BLOCK",
|
|
37
|
+
"EXPORT_BLOCK",
|
|
38
|
+
"EXPORT_SNIPPET",
|
|
39
|
+
"FIXED_WIDTH",
|
|
40
|
+
"FOOTNOTE_REFERENCE",
|
|
41
|
+
"HEADING",
|
|
42
|
+
"HORIZONTAL_RULE",
|
|
43
|
+
"INDENT",
|
|
44
|
+
"INLINE_BABEL_CALL",
|
|
45
|
+
"INLINE_HEADERS",
|
|
46
|
+
"INLINE_SOURCE_BLOCK",
|
|
47
|
+
"ITALIC",
|
|
48
|
+
"LINE_BREAK",
|
|
49
|
+
"LIST",
|
|
50
|
+
"LIST_ITEM",
|
|
51
|
+
"LOGBOOK_DRAWER",
|
|
52
|
+
"MACRO",
|
|
53
|
+
"MACRO_ARGUMENTS",
|
|
54
|
+
"MACRO_NAME",
|
|
55
|
+
"NODE_PROPERTY",
|
|
56
|
+
"ORG_TABLE",
|
|
57
|
+
"PARAGRAPH",
|
|
58
|
+
"PLAIN_LINK",
|
|
59
|
+
"PLAIN_TEXT",
|
|
60
|
+
"PLANNING",
|
|
61
|
+
"PLANNING_KEYWORD",
|
|
62
|
+
"PLOT_KEYWORD",
|
|
63
|
+
"PROPERTY_DRAWER",
|
|
64
|
+
"QUOTE_BLOCK",
|
|
65
|
+
"RADIO_TARGET",
|
|
66
|
+
"REGULAR_LINK",
|
|
67
|
+
"REPEATER_MARK",
|
|
68
|
+
"RESULTS_KEYWORD",
|
|
69
|
+
"SCHEDULED",
|
|
70
|
+
"SPECIAL_BLOCK",
|
|
71
|
+
"SPECIAL_KEYWORD",
|
|
72
|
+
"SRC_BLOCK",
|
|
73
|
+
"STRIKE_THROUGH",
|
|
74
|
+
"SUBSCRIPT",
|
|
75
|
+
"SUPERSCRIPT",
|
|
76
|
+
"TABLEEL_TABLE",
|
|
77
|
+
"TABLE_CELL",
|
|
78
|
+
"TABLE_ROW",
|
|
79
|
+
"TABLE_RULE",
|
|
80
|
+
"TAG",
|
|
81
|
+
"TARGET",
|
|
82
|
+
"TBLFM_LINE",
|
|
83
|
+
"TBLNAME_KEYWORD",
|
|
84
|
+
"TIMESTAMP",
|
|
85
|
+
"TIME_UNIT",
|
|
86
|
+
"TITLE",
|
|
87
|
+
"TODO",
|
|
88
|
+
"TS_DAY",
|
|
89
|
+
"TS_DAYNAME",
|
|
90
|
+
"TS_MONTH",
|
|
91
|
+
"TS_TIME",
|
|
92
|
+
"TS_YEAR",
|
|
93
|
+
"UNDERLINE",
|
|
94
|
+
"VERBATIM",
|
|
95
|
+
"VERSE_BLOCK",
|
|
96
|
+
"ZEROTH_SECTION",
|
|
97
|
+
]
|
|
98
|
+
|
|
99
|
+
# ---------------------------------------------------------------------------
|
|
100
|
+
# Document / section structure
|
|
101
|
+
# ---------------------------------------------------------------------------
|
|
102
|
+
|
|
103
|
+
HEADING = "heading"
|
|
104
|
+
ZEROTH_SECTION = "zeroth_section"
|
|
105
|
+
|
|
106
|
+
# ---------------------------------------------------------------------------
|
|
107
|
+
# Planning
|
|
108
|
+
# ---------------------------------------------------------------------------
|
|
109
|
+
|
|
110
|
+
PLANNING = "planning"
|
|
111
|
+
PLANNING_KEYWORD = "planning_keyword"
|
|
112
|
+
TIMESTAMP = "timestamp"
|
|
113
|
+
|
|
114
|
+
# Planning keyword values — the text content of ``planning_keyword`` nodes
|
|
115
|
+
# (e.g. the word ``SCHEDULED`` in the source), not grammar node types.
|
|
116
|
+
SCHEDULED = "SCHEDULED"
|
|
117
|
+
DEADLINE = "DEADLINE"
|
|
118
|
+
CLOSED = "CLOSED"
|
|
119
|
+
|
|
120
|
+
# ---------------------------------------------------------------------------
|
|
121
|
+
# Timestamp sub-nodes
|
|
122
|
+
# ---------------------------------------------------------------------------
|
|
123
|
+
|
|
124
|
+
TS_DAY = "ts_day"
|
|
125
|
+
TS_DAYNAME = "ts_dayname"
|
|
126
|
+
TS_MONTH = "ts_month"
|
|
127
|
+
TS_TIME = "ts_time"
|
|
128
|
+
TS_YEAR = "ts_year"
|
|
129
|
+
DELAY_MARK = "delay_mark"
|
|
130
|
+
REPEATER_MARK = "repeater_mark"
|
|
131
|
+
TIME_UNIT = "time_unit"
|
|
132
|
+
|
|
133
|
+
# ---------------------------------------------------------------------------
|
|
134
|
+
# Heading components
|
|
135
|
+
# ---------------------------------------------------------------------------
|
|
136
|
+
|
|
137
|
+
COMPLETION_COUNTER = "completion_counter"
|
|
138
|
+
TAG = "tag"
|
|
139
|
+
|
|
140
|
+
# ---------------------------------------------------------------------------
|
|
141
|
+
# Keywords
|
|
142
|
+
# ---------------------------------------------------------------------------
|
|
143
|
+
|
|
144
|
+
SPECIAL_KEYWORD = "special_keyword"
|
|
145
|
+
|
|
146
|
+
# Special keyword values — the upper-cased key text of ``special_keyword``
|
|
147
|
+
# nodes (e.g. the word ``TITLE`` in ``#+TITLE:``), not grammar node types.
|
|
148
|
+
TITLE = "TITLE"
|
|
149
|
+
AUTHOR = "AUTHOR"
|
|
150
|
+
CATEGORY = "CATEGORY"
|
|
151
|
+
DESCRIPTION = "DESCRIPTION"
|
|
152
|
+
TODO = "TODO"
|
|
153
|
+
FILETAGS = "FILETAGS"
|
|
154
|
+
|
|
155
|
+
# ---------------------------------------------------------------------------
|
|
156
|
+
# Drawers
|
|
157
|
+
# ---------------------------------------------------------------------------
|
|
158
|
+
|
|
159
|
+
DRAWER = "drawer"
|
|
160
|
+
LOGBOOK_DRAWER = "logbook_drawer"
|
|
161
|
+
NODE_PROPERTY = "node_property"
|
|
162
|
+
PROPERTY_DRAWER = "property_drawer"
|
|
163
|
+
|
|
164
|
+
# ---------------------------------------------------------------------------
|
|
165
|
+
# Element types
|
|
166
|
+
# ---------------------------------------------------------------------------
|
|
167
|
+
|
|
168
|
+
BABEL_CALL = "babel_call"
|
|
169
|
+
BLANK_LINE = "blank_line"
|
|
170
|
+
CAPTION_KEYWORD = "caption_keyword"
|
|
171
|
+
CENTER_BLOCK = "center_block"
|
|
172
|
+
CLOCK = "clock"
|
|
173
|
+
COMMENT = "comment"
|
|
174
|
+
COMMENT_BLOCK = "comment_block"
|
|
175
|
+
DYNAMIC_BLOCK = "dynamic_block"
|
|
176
|
+
EXAMPLE_BLOCK = "example_block"
|
|
177
|
+
EXPORT_BLOCK = "export_block"
|
|
178
|
+
FIXED_WIDTH = "fixed_width"
|
|
179
|
+
HORIZONTAL_RULE = "horizontal_rule"
|
|
180
|
+
INDENT = "indent"
|
|
181
|
+
LIST = "list"
|
|
182
|
+
LIST_ITEM = "list_item"
|
|
183
|
+
ORG_TABLE = "org_table"
|
|
184
|
+
PARAGRAPH = "paragraph"
|
|
185
|
+
PLOT_KEYWORD = "plot_keyword"
|
|
186
|
+
QUOTE_BLOCK = "quote_block"
|
|
187
|
+
RESULTS_KEYWORD = "results_keyword"
|
|
188
|
+
SPECIAL_BLOCK = "special_block"
|
|
189
|
+
SRC_BLOCK = "src_block"
|
|
190
|
+
TABLEEL_TABLE = "tableel_table"
|
|
191
|
+
TBLNAME_KEYWORD = "tblname_keyword"
|
|
192
|
+
VERSE_BLOCK = "verse_block"
|
|
193
|
+
|
|
194
|
+
# ---------------------------------------------------------------------------
|
|
195
|
+
# Table sub-nodes
|
|
196
|
+
# ---------------------------------------------------------------------------
|
|
197
|
+
|
|
198
|
+
TABLE_CELL = "table_cell"
|
|
199
|
+
TABLE_ROW = "table_row"
|
|
200
|
+
TABLE_RULE = "table_rule"
|
|
201
|
+
TBLFM_LINE = "tblfm_line"
|
|
202
|
+
|
|
203
|
+
# ---------------------------------------------------------------------------
|
|
204
|
+
# Inline object types
|
|
205
|
+
# ---------------------------------------------------------------------------
|
|
206
|
+
|
|
207
|
+
ANGLE_LINK = "angle_link"
|
|
208
|
+
BOLD = "bold"
|
|
209
|
+
ENTITY = "entity"
|
|
210
|
+
CITATION = "citation"
|
|
211
|
+
CODE = "code"
|
|
212
|
+
EXPORT_SNIPPET = "export_snippet"
|
|
213
|
+
FOOTNOTE_REFERENCE = "footnote_reference"
|
|
214
|
+
INLINE_BABEL_CALL = "inline_babel_call"
|
|
215
|
+
INLINE_HEADERS = "inline_headers"
|
|
216
|
+
INLINE_SOURCE_BLOCK = "inline_source_block"
|
|
217
|
+
ITALIC = "italic"
|
|
218
|
+
LINE_BREAK = "line_break"
|
|
219
|
+
MACRO = "macro"
|
|
220
|
+
MACRO_ARGUMENTS = "macro_arguments"
|
|
221
|
+
MACRO_NAME = "macro_name"
|
|
222
|
+
PLAIN_LINK = "plain_link"
|
|
223
|
+
PLAIN_TEXT = "plain_text"
|
|
224
|
+
RADIO_TARGET = "radio_target"
|
|
225
|
+
REGULAR_LINK = "regular_link"
|
|
226
|
+
STRIKE_THROUGH = "strike_through"
|
|
227
|
+
SUBSCRIPT = "subscript"
|
|
228
|
+
SUPERSCRIPT = "superscript"
|
|
229
|
+
TARGET = "target"
|
|
230
|
+
UNDERLINE = "underline"
|
|
231
|
+
VERBATIM = "verbatim"
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
"""Document-level parsing, semantic classes, and raw tree access.
|
|
2
|
+
|
|
3
|
+
This subpackage provides:
|
|
4
|
+
|
|
5
|
+
* [org_parser.document.Document][] — the top-level semantic representation of an Org file,
|
|
6
|
+
including keyword properties (``TITLE``, ``AUTHOR``, …), the zeroth-section
|
|
7
|
+
body, and top-level headings.
|
|
8
|
+
* [org_parser.document.Heading][] — a heading / sub-heading with its parsed components
|
|
9
|
+
(level, TODO state, priority, title, tags, body, sub-headings).
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from org_parser.document._document import Document, ParseError
|
|
13
|
+
from org_parser.document._heading import Heading
|
|
14
|
+
from org_parser.document._loader import load_raw
|
|
15
|
+
|
|
16
|
+
__all__ = ["Document", "Heading", "ParseError", "load_raw"]
|
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
"""Shared body-extraction helpers.
|
|
2
|
+
|
|
3
|
+
Used mostly by [org_parser.document.Document][] and [org_parser.document.Heading][].
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
from typing import TYPE_CHECKING
|
|
9
|
+
|
|
10
|
+
from org_parser._node import is_error_node, node_source
|
|
11
|
+
from org_parser._nodes import INDENT
|
|
12
|
+
from org_parser.element import Logbook, Properties, Repeat
|
|
13
|
+
from org_parser.element._dispatch import body_element_factories
|
|
14
|
+
from org_parser.element._element import Element, element_from_error_or_unknown
|
|
15
|
+
from org_parser.element._structure import Indent
|
|
16
|
+
|
|
17
|
+
if TYPE_CHECKING:
|
|
18
|
+
from collections.abc import Callable
|
|
19
|
+
|
|
20
|
+
import tree_sitter
|
|
21
|
+
|
|
22
|
+
from org_parser.document._document import Document
|
|
23
|
+
from org_parser.document._heading import Heading
|
|
24
|
+
from org_parser.text._rich_text import RichText
|
|
25
|
+
from org_parser.time import Clock
|
|
26
|
+
|
|
27
|
+
# NOTE: Callable is kept in TYPE_CHECKING for the dispatch dict type annotations.
|
|
28
|
+
|
|
29
|
+
__all__ = [
|
|
30
|
+
"extract_body_element",
|
|
31
|
+
"extract_indent",
|
|
32
|
+
"merge_logbook_drawers",
|
|
33
|
+
"merge_properties_drawers",
|
|
34
|
+
]
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def merge_properties_drawers(
    drawers: list[Properties],
    *,
    parent: Heading | Document,
) -> Properties | None:
    """Collapse repeated properties drawers into one drawer.

    Args:
        drawers: All collected [org_parser.element.Properties][] drawers in
            source order.
        parent: Owner object to assign to the merged drawer.

    Returns:
        A single merged [org_parser.element.Properties][], or ``None`` when
        *drawers* is empty. Later drawers override earlier entries for the
        same key.
    """
    if not drawers:
        return None
    combined: dict[str, RichText] = {}
    for drawer in drawers:
        for key, value in drawer.items():
            # Drop any earlier entry first so a repeated key adopts the
            # insertion position of its latest occurrence.
            combined.pop(key, None)
            combined[key] = value
    return Properties(properties=combined, parent=parent)
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def merge_logbook_drawers(
    drawers: list[Logbook],
    *,
    parent: Heading | Document,
) -> Logbook | None:
    """Collapse repeated logbook drawers into one drawer.

    Args:
        drawers: All collected [org_parser.element.Logbook][] drawers in
            source order.
        parent: Owner object to assign to the merged drawer.

    Returns:
        A single merged [org_parser.element.Logbook][], or ``None`` when
        *drawers* is empty.
    """
    if not drawers:
        return None
    # Concatenate each drawer's contents in source order.
    bodies: list[Element] = [entry for drawer in drawers for entry in drawer.body]
    clocks: list[Clock] = [
        clock for drawer in drawers for clock in drawer.clock_entries
    ]
    repeats: list[Repeat] = [
        repeat for drawer in drawers for repeat in drawer.repeats
    ]
    return Logbook(
        body=bodies,
        clock_entries=clocks,
        repeats=repeats,
        parent=parent,
    )
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def extract_body_element(
    node: tree_sitter.Node,
    *,
    parent: Heading | Document,
    document: Document,
) -> Element:
    """Build one body element instance from a tree-sitter node.

    Error nodes (``ERROR`` type or ``is_missing``) are recovered immediately
    before dispatch so that callers do not need to guard the call site.

    Args:
        node: A tree-sitter child node from a section or zeroth-section.
        parent: Owner heading or document.
        document: The owning [org_parser.document.Document][].

    Returns:
        A semantic [org_parser.element.Element][] subclass matching
        *node.type*, or a recovered [org_parser.element.Paragraph][] for
        error and unrecognised nodes.
    """
    if is_error_node(node):
        return element_from_error_or_unknown(node, document, parent=parent)
    # Start from the shared factory table, then let the indent handler
    # take precedence for "indent" nodes (same override order as before).
    factories: dict[str, Callable[..., Element]] = dict(body_element_factories())
    factories[INDENT] = extract_indent
    builder = factories.get(node.type)
    if builder is not None:
        return builder(node, document, parent=parent)
    return element_from_error_or_unknown(node, document, parent=parent)
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def extract_indent(
    node: tree_sitter.Node,
    document: Document,
    *,
    parent: Heading | Document,
) -> Indent:
    """Build one [org_parser.element.Indent][] with recursively parsed body nodes.

    Args:
        node: A tree-sitter ``indent`` node.
        document: The owning [org_parser.document.Document][].
        parent: Owner heading or document.

    Returns:
        An [org_parser.element.Indent][] whose body elements are recursively
        parsed.
    """
    indent_text = node_source(node.child_by_field_name("indent"), document)
    children = [
        extract_body_element(child, parent=parent, document=document)
        for child in node.children_by_field_name("body")
        if child.is_named
    ]
    # An empty indent string is normalised to None.
    result = Indent(body=children, indent=indent_text or None, parent=parent)
    result.attach_source(node, document)
    return result
|