PyPI - oaknut-basic - Versions diffs - 12.8.2__tar.gz → 12.9.0__tar.gz - Mend

oaknut-basic 12.8.2__tar.gz → 12.9.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (38) hide show

{oaknut_basic-12.8.2/src/oaknut_basic.egg-info → oaknut_basic-12.9.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: oaknut-basic
-Version: 12.8.2
+Version: 12.9.0
 Summary: BBC BASIC tools: program tokeniser/de-tokeniser and PRINT#/INPUT# data-file reader/writer
 Author-email: Robert Smallshire <robert@smallshire.org.uk>
 License-Expression: MIT

{oaknut_basic-12.8.2 → oaknut_basic-12.9.0}/src/oaknut/basic/__init__.py RENAMED Viewed

@@ -30,7 +30,7 @@ from oaknut.basic.datafile import (
     BbcBasicDataReader,
     BbcBasicDataWriter,
 )
-from oaknut.basic.detokeniser import detokenise
+from oaknut.basic.detokeniser import detokenise, detokenise_body
 from oaknut.basic.exceptions import (
     AlreadyNumberedError,
     BASICError,
@@ -52,14 +52,34 @@ from oaknut.basic.exceptions import (
     UnnumberedLineError,
 )
 from oaknut.basic.float5 import pack_float5, unpack_float5
+from oaknut.basic.linenumber import decode_line_number
 from oaknut.basic.numbering import (
     DEFAULT_LINE_NUMBER,
     DEFAULT_LINE_STEP,
     number_lines,
 )
+from oaknut.basic.scanner import (
+    LineRecord,
+    Token,
+    TokenKind,
+    scan,
+    scan_program,
+)
 from oaknut.basic.tokeniser import tokenise
+from oaknut.basic.tokens import (
+    FLAG_CONDITIONAL,
+    FLAG_FN_PROC,
+    FLAG_LINE_NUMBER,
+    FLAG_MIDDLE,
+    FLAG_PSEUDO_VAR,
+    FLAG_START,
+    FLAG_STOP_LINE,
+    KEYWORDS,
+    LINE_NUMBER_TOKEN,
+    TOKEN_TO_KEYWORD,
+)
-__version__ = "12.8.2"
+__version__ = "12.9.0"
 # Canonical load addresses for BBC BASIC programs on each host.
 # Programs saved by *SAVE on a real machine use these by default.
@@ -71,6 +91,16 @@ __all__ = [
     "DEFAULT_LINE_NUMBER",
     "DEFAULT_LINE_STEP",
     "ELECTRON_BASIC_LOAD_ADDRESS",
+    "FLAG_CONDITIONAL",
+    "FLAG_FN_PROC",
+    "FLAG_LINE_NUMBER",
+    "FLAG_MIDDLE",
+    "FLAG_PSEUDO_VAR",
+    "FLAG_START",
+    "FLAG_STOP_LINE",
+    "KEYWORDS",
+    "LINE_NUMBER_TOKEN",
+    "TOKEN_TO_KEYWORD",
     "AlreadyNumberedError",
     "BASICError",
     "BbcBasicDataFile",
@@ -85,17 +115,24 @@ __all__ = [
     "InvalidLineLengthError",
     "LineNumberOrderError",
     "LineNumberRangeError",
+    "LineRecord",
     "LineTooLongError",
     "MissingLineMarkerError",
     "StringTooLongError",
+    "Token",
+    "TokenKind",
     "TokeniseError",
     "TruncatedProgramError",
     "TruncatedRecordError",
     "UnknownTagError",
     "UnnumberedLineError",
+    "decode_line_number",
     "detokenise",
+    "detokenise_body",
     "number_lines",
     "pack_float5",
+    "scan",
+    "scan_program",
     "unpack_float5",
     "tokenise",
 ]

{oaknut_basic-12.8.2 → oaknut_basic-12.9.0}/src/oaknut/basic/cli.py RENAMED Viewed

@@ -190,10 +190,11 @@ def _listing_to_bytes(listing: str, encoding: str) -> bytes:
 )
 @click.option(
     "--encoding",
-    default="acorn",
+    default="utf-8",
     show_default=True,
     callback=_validate_encoding,
-    help="Text encoding of the INPUT source. Defaults to the BBC character set.",
+    help='Text encoding of the INPUT source. Use "acorn" for the BBC '
+    "character set (e.g. source taken straight off a disc image).",
 )
 @click.option(
     "--start",
@@ -226,8 +227,9 @@ def tokenise(
         oaknut-basic tokenise --start 10 unnumbered.bas MENU
-    INPUT is read in --encoding (the BBC ``acorn`` character set by default;
-    pass ``utf-8`` for source authored in a modern editor).
+    INPUT is read in --encoding (``utf-8`` by default, for source authored
+    in a modern editor; pass ``acorn`` for the BBC character set, e.g.
+    source taken straight off a disc image).
     """
     from oaknut.basic import tokenise as tokenise_source
@@ -252,10 +254,11 @@ def tokenise(
 )
 @click.option(
     "--encoding",
-    default="acorn",
+    default="utf-8",
     show_default=True,
     callback=_validate_encoding,
-    help="Text encoding for the OUTPUT source. Defaults to the BBC character set.",
+    help='Text encoding for the OUTPUT source. Use "acorn" for the BBC '
+    "character set with CR line endings (e.g. writing back to a disc image).",
 )
 def detokenise(input_stream, output_stream, encoding: str) -> None:
     """De-tokenise a stored BBC BASIC program into source text.
@@ -267,8 +270,9 @@ def detokenise(input_stream, output_stream, encoding: str) -> None:
         oaknut-basic detokenise MENU menu.bas
         disc get game.ssd MENU - | oaknut-basic detokenise
-    OUTPUT is written in --encoding (the BBC ``acorn`` character set with CR
-    line endings by default; pass ``utf-8`` for a host text file).
+    OUTPUT is written in --encoding (``utf-8`` with host-native ``LF`` line
+    endings by default, for a host text file; pass ``acorn`` for the BBC
+    character set with ``CR`` endings, e.g. writing back to a disc image).
     """
     from oaknut.basic import detokenise as detokenise_program

oaknut_basic-12.9.0/src/oaknut/basic/detokeniser.py ADDED Viewed

@@ -0,0 +1,73 @@
+"""De-tokenise BBC BASIC II bytes into source text.
+The de-tokeniser is the *text projection* of the token scanner
+(:mod:`oaknut.basic.scanner`): :func:`detokenise` mirrors the ROM's
+``LIST`` walk over the ``&0D``-framed stored form — rendering each line's
+number followed by its body — while :func:`detokenise_body` flattens one
+unframed line body. Both join the body :class:`~oaknut.basic.Token`
+stream's values: keyword tokens expand to their table spelling, ``&8D``
+references decode back to plain decimal, and bytes inside a quoted string
+are emitted verbatim (no token expansion). Where keyword *adjacency*
+matters — ``?QPLOT`` is not re-typeable as a program — reach past the
+text for :func:`oaknut.basic.scan` instead.
+The returned string uses latin-1/code-point semantics: a stored byte
+``b`` becomes ``chr(b)``, so the text round-trips byte-exactly through
+:func:`oaknut.basic.tokenise`. Mapping to a host character set (Acorn,
+UTF-8) is the caller's job — the CLI does it at its I/O boundary.
+The line layout (from the disassembly) is::
+    &0D <lineNo-hi> <lineNo-lo> <length> <body...>   ...   &0D &FF
+where *length* counts the whole record from this ``&0D`` up to (not
+including) the next one, so ``length = 4 + len(body)``.
+"""
+from __future__ import annotations
+from oaknut.basic.scanner import scan, scan_program
+def detokenise(data: bytes) -> str:
+    """De-tokenise a BBC BASIC II program into source text.
+    Args:
+        data: A tokenised program, as stored on disc or in memory,
+            terminated by the ``&0D &FF`` end marker.
+    Returns:
+        The program as numbered source text, lines separated by ``"\\n"``
+        with a trailing newline. Empty for an empty program.
+    Raises:
+        DetokeniseError: The token stream is malformed — a missing line
+            marker, a truncated header or reference, or an impossible
+            length byte.
+    """
+    return "".join(
+        f"{record.line_number}{''.join(token.value for token in record.tokens)}\n"
+        for record in scan_program(data)
+    )
+def detokenise_body(data: bytes) -> str:
+    """De-tokenise an unframed line body into text.
+    Unlike :func:`detokenise`, this expects a line *body* — inline token
+    bytes with no ``&0D`` framing and no leading line number, the form
+    BBC Micro Bot / Owlet programs and AUTO-style entry arrive in. It is
+    the text join of :func:`oaknut.basic.scan`; when keyword adjacency
+    matters, consume that token stream directly instead.
+    Args:
+        data: A line body of inline token bytes.
+    Returns:
+        The body as source text, using latin-1/code-point semantics so it
+        round-trips byte-exactly through :func:`oaknut.basic.tokenise`.
+    Raises:
+        DetokeniseError: A ``&8D`` line-number reference is truncated.
+    """
+    return "".join(token.value for token in scan(data))

oaknut_basic-12.9.0/src/oaknut/basic/scanner.py ADDED Viewed

@@ -0,0 +1,192 @@
+"""Scan tokenised BBC BASIC II bytes into a position-aware token stream.
+De-tokenising to text is lossy for keyword adjacency: the body
+``?Q <PLOT> ?Q`` renders as the text ``?QPLOT?Q``, which the ROM
+tokeniser re-reads as ``?`` followed by the *variable* ``QPLOT`` — the
+embedded ``PLOT`` keyword is gone (after a non-keyword letter the crunch
+consumes the whole identifier; see :func:`oaknut.basic.tokenise`). The
+only faithful representation of a tokenised program is therefore the
+*bytes*, where ``PLOT`` is a distinct token. :func:`scan` yields that
+stream: keyword tokens resolved to their spelling, ``&8D`` line-number
+references decoded, quoted strings preserved verbatim, and every other
+byte coalesced into literal :attr:`TokenKind.TEXT` runs for the consumer
+to sub-lex.
+The walk mirrors the de-tokeniser's; in fact the de-tokeniser is one
+projection of this stream — joining every token's :attr:`Token.value`
+reproduces a body's text exactly, which is how
+:func:`oaknut.basic.detokenise_body` is defined. :func:`scan` works on a
+line *body* (inline tokens, no framing); :func:`scan_program` walks the
+``&0D``-framed stored form and yields a :class:`LineRecord` per line.
+"""
+from __future__ import annotations
+import enum
+from collections.abc import Iterator
+from dataclasses import dataclass
+from oaknut.basic.exceptions import (
+    InvalidLineLengthError,
+    MissingLineMarkerError,
+    TruncatedProgramError,
+)
+from oaknut.basic.linenumber import decode_line_number
+from oaknut.basic.tokens import HEADER_LENGTH, LINE_NUMBER_TOKEN, TOKEN_TO_KEYWORD
+_CR = 0x0D
+_END_MARKER = 0xFF
+_QUOTE = 0x22
+class TokenKind(enum.Enum):
+    """The kind of a scanned :class:`Token`."""
+    #: A keyword token byte, resolved to its spelling via the token table.
+    KEYWORD = "KEYWORD"
+    #: A run of literal bytes — identifiers, numbers, operators,
+    #: punctuation, spaces, and any unknown high byte — for the consumer
+    #: to sub-lex. Each byte ``b`` appears as ``chr(b)``.
+    TEXT = "TEXT"
+    #: A ``"..."`` string literal, copied verbatim through the closing
+    #: quote (or the end of the body, if unterminated).
+    STRING = "STRING"
+    #: A ``&8D`` three-byte line-number reference, decoded to decimal.
+    LINENUM = "LINENUM"
+@dataclass(frozen=True)
+class Token:
+    """One unit of a scanned BBC BASIC line body.
+    Attributes:
+        kind: Which :class:`TokenKind` this is.
+        value: The keyword spelling (``KEYWORD``), the literal run
+            (``TEXT``), the quoted string including its quotes
+            (``STRING``), or the decimal line number (``LINENUM``).
+        start: Byte offset of the token within the scanned data, shifted
+            by ``base_offset``.
+        token: The token byte, for a ``KEYWORD``; ``None`` otherwise.
+    """
+    kind: TokenKind
+    value: str
+    start: int
+    token: int | None = None
+@dataclass(frozen=True)
+class LineRecord:
+    """One line of a scanned stored program.
+    Attributes:
+        line_number: The line's number.
+        tokens: The line body as a tuple of :class:`Token`.
+        start: Byte offset of the line's ``&0D`` marker.
+    """
+    line_number: int
+    tokens: tuple[Token, ...]
+    start: int
+def scan(data: bytes, *, base_offset: int = 0) -> Iterator[Token]:
+    """Scan an unframed line body into a stream of :class:`Token`.
+    Args:
+        data: A line *body* — inline token bytes with no ``&0D`` line
+            framing and no leading line number.
+        base_offset: Added to every token's :attr:`~Token.start`, so
+            offsets can be reported against a larger buffer (for example
+            the line body's position within a whole program).
+    Yields:
+        One :class:`Token` per keyword, literal run, string literal, and
+        ``&8D`` line-number reference, in source order.
+    Raises:
+        DetokeniseError: A ``&8D`` reference is truncated.
+    """
+    n = len(data)
+    i = 0
+    text_start = -1
+    text_chars: list[str] = []
+    def flush() -> Iterator[Token]:
+        nonlocal text_chars, text_start
+        if text_chars:
+            yield Token(TokenKind.TEXT, "".join(text_chars), base_offset + text_start)
+            text_chars = []
+            text_start = -1
+    while i < n:
+        b = data[i]
+        if b == _QUOTE:
+            yield from flush()
+            j = i + 1
+            while j < n and data[j] != _QUOTE:
+                j += 1
+            end = j + 1 if j < n else n
+            yield Token(TokenKind.STRING, "".join(chr(c) for c in data[i:end]), base_offset + i)
+            i = end
+            continue
+        if b == LINE_NUMBER_TOKEN:
+            yield from flush()
+            payload = data[i + 1 : i + 4]
+            if len(payload) != 3:
+                raise TruncatedProgramError(
+                    offset=base_offset + i,
+                    detail="incomplete &8D line-number reference",
+                )
+            yield Token(TokenKind.LINENUM, str(decode_line_number(payload)), base_offset + i)
+            i += 4
+            continue
+        if b >= 0x80 and b in TOKEN_TO_KEYWORD:
+            yield from flush()
+            yield Token(TokenKind.KEYWORD, TOKEN_TO_KEYWORD[b], base_offset + i, b)
+            i += 1
+            continue
+        # A literal byte (< &80) or an unknown high byte (e.g. the &CE
+        # gap): coalesce into a TEXT run so the byte still round-trips.
+        if text_start < 0:
+            text_start = i
+        text_chars.append(chr(b))
+        i += 1
+    yield from flush()
+def scan_program(data: bytes) -> Iterator[LineRecord]:
+    """Walk a ``&0D``-framed stored program, scanning each line body.
+    Args:
+        data: A tokenised program as stored on disc or in memory,
+            terminated by the ``&0D &FF`` end marker.
+    Yields:
+        One :class:`LineRecord` per line, in storage order, each carrying
+        the line number and its body :class:`Token` stream.
+    Raises:
+        DetokeniseError: The framing is malformed — a missing line
+            marker, a truncated header or reference, or an impossible
+            length byte.
+    """
+    n = len(data)
+    i = 0
+    while i < n:
+        if data[i] != _CR:
+            raise MissingLineMarkerError(offset=i, found=data[i])
+        if i + 1 >= n:
+            raise TruncatedProgramError(offset=i, detail="line marker with no line number")
+        if data[i + 1] == _END_MARKER:
+            break
+        if i + HEADER_LENGTH > n:
+            raise TruncatedProgramError(offset=i, detail="incomplete line header")
+        line_number = (data[i + 1] << 8) | data[i + 2]
+        length = data[i + 3]
+        if length < HEADER_LENGTH or i + length > n:
+            raise InvalidLineLengthError(offset=i, length=length)
+        body = data[i + HEADER_LENGTH : i + length]
+        tokens = tuple(scan(body, base_offset=i + HEADER_LENGTH))
+        yield LineRecord(line_number=line_number, tokens=tokens, start=i)
+        i += length

{oaknut_basic-12.8.2 → oaknut_basic-12.9.0}/src/oaknut/basic/tokens.py RENAMED Viewed

@@ -25,6 +25,8 @@ Two structural facts drive the tables below:
 from __future__ import annotations
+from types import MappingProxyType
 # The line-number reference token. Not a table entry: the tokeniser
 # produces ``&8D`` followed by a 3-byte encoded line number, and the
 # de-tokeniser special-cases it (see :mod:`oaknut.basic.linenumber`).
@@ -192,6 +194,9 @@ _ASSIGNMENT_FORMS: tuple[tuple[str, int], ...] = (
 )
 # token byte -> keyword spelling, covering every entry in the ROM table.
-# &8D (line number) and &CE (unused gap) are deliberately absent.
-TOKEN_TO_KEYWORD: dict[int, str] = {token: keyword for keyword, token, _flags in KEYWORDS}
-TOKEN_TO_KEYWORD.update({token: keyword for keyword, token in _ASSIGNMENT_FORMS})
+# &8D (line number) and &CE (unused gap) are deliberately absent. Exposed
+# read-only: it is public API a byte-level consumer reads but must not
+# mutate (a stray write would corrupt the tokeniser and de-tokeniser).
+_token_to_keyword: dict[int, str] = {token: keyword for keyword, token, _flags in KEYWORDS}
+_token_to_keyword.update({token: keyword for keyword, token in _ASSIGNMENT_FORMS})
+TOKEN_TO_KEYWORD: MappingProxyType[int, str] = MappingProxyType(_token_to_keyword)

{oaknut_basic-12.8.2 → oaknut_basic-12.9.0/src/oaknut_basic.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: oaknut-basic
-Version: 12.8.2
+Version: 12.9.0
 Summary: BBC BASIC tools: program tokeniser/de-tokeniser and PRINT#/INPUT# data-file reader/writer
 Author-email: Robert Smallshire <robert@smallshire.org.uk>
 License-Expression: MIT

{oaknut_basic-12.8.2 → oaknut_basic-12.9.0}/src/oaknut_basic.egg-info/SOURCES.txt RENAMED Viewed

@@ -9,6 +9,7 @@ src/oaknut/basic/exceptions.py
 src/oaknut/basic/float5.py
 src/oaknut/basic/linenumber.py
 src/oaknut/basic/numbering.py
+src/oaknut/basic/scanner.py
 src/oaknut/basic/tokeniser.py
 src/oaknut/basic/tokens.py
 src/oaknut_basic.egg-info/PKG-INFO
@@ -29,5 +30,6 @@ tests/test_linenumber.py
 tests/test_numbering.py
 tests/test_rom_golden.py
 tests/test_rom_golden_detokenise.py
+tests/test_scanner.py
 tests/test_tokeniser.py
 tests/test_tokens.py

{oaknut_basic-12.8.2 → oaknut_basic-12.9.0}/tests/test_cli.py RENAMED Viewed

@@ -176,15 +176,54 @@ class TestTokeniseCommand:
         assert "no line number" in result.output
+class TestTokeniseEncoding:
+    def test_default_decodes_host_utf8_pound_sign(self):
+        # The default encoding is UTF-8, so a £ typed in a host editor
+        # (0xC2 0xA3) maps to the single Acorn code point, not the raw
+        # multibyte sequence.
+        runner = CliRunner()
+        result = runner.invoke(cli, ["tokenise"], input='10 PRINT "£"\n'.encode("utf-8"))
+        assert result.exit_code == 0
+        # Acorn stores £ as 0x60 inside the tokenised string literal.
+        assert b'"\x60"' in result.stdout_bytes
+    def test_acorn_encoding_passes_raw_bytes_through(self):
+        # --encoding acorn treats the input as Acorn bytes already, so a
+        # 0x60 byte is the £ literal verbatim.
+        runner = CliRunner()
+        result = runner.invoke(
+            cli, ["tokenise", "--encoding", "acorn"], input=b'10 PRINT "\x60"\n'
+        )
+        assert result.exit_code == 0
+        assert b'"\x60"' in result.stdout_bytes
 class TestDetokeniseCommand:
     def test_pipe_detokenises_to_stdout(self):
         program = basic.tokenise("10 PRINT")
         runner = CliRunner()
         result = runner.invoke(cli, ["detokenise"], input=program)
         assert result.exit_code == 0
-        # Acorn output: CR line ending.
+        # Default UTF-8 output: host-native LF line ending.
+        assert result.stdout_bytes == b"10 PRINT\n"
+    def test_acorn_encoding_uses_cr_terminators(self):
+        program = basic.tokenise("10 PRINT")
+        runner = CliRunner()
+        result = runner.invoke(cli, ["detokenise", "--encoding", "acorn"], input=program)
+        assert result.exit_code == 0
+        # Acorn output: native CR line ending.
         assert result.stdout_bytes == b"10 PRINT\r"
+    def test_default_writes_host_utf8_pound_sign(self):
+        # A program whose string literal holds the Acorn £ (0x60)
+        # de-tokenises to a UTF-8 £ under the default encoding.
+        program = basic.tokenise('10 PRINT "£"')
+        runner = CliRunner()
+        result = runner.invoke(cli, ["detokenise"], input=program)
+        assert result.exit_code == 0
+        assert "£".encode("utf-8") in result.stdout_bytes
     def test_malformed_program_reports_the_offset(self):
         runner = CliRunner()
         result = runner.invoke(cli, ["detokenise"], input=b"\x99")
@@ -199,4 +238,5 @@ class TestTokeniseDetokeniseRoundTrip:
         assert tokenised.exit_code == 0
         listing = runner.invoke(cli, ["detokenise"], input=tokenised.stdout_bytes)
         assert listing.exit_code == 0
-        assert listing.stdout_bytes == b"10 PRINT\r20 GOTO 10\r"
+        # Default UTF-8 output: host-native LF line endings.
+        assert listing.stdout_bytes == b"10 PRINT\n20 GOTO 10\n"

oaknut_basic-12.9.0/tests/test_scanner.py ADDED Viewed

@@ -0,0 +1,179 @@
+"""Tests for the BBC BASIC II token-stream scanner.
+The scanner is the faithful primitive: where the de-tokeniser flattens a
+body to text (lossy for keyword adjacency), :func:`scan` yields a token
+per keyword, literal run, string, and line-number reference, each with
+its byte offset. The headline case is ``?Q <PLOT> ?Q``, whose text
+``?QPLOT?Q`` re-lexes as the variable ``QPLOT`` — only the stream keeps
+``PLOT`` as a distinct token.
+"""
+import pytest
+from oaknut.basic import (
+    LineRecord,
+    Token,
+    TokenKind,
+    detokenise_body,
+    scan,
+    scan_program,
+)
+from oaknut.basic.exceptions import (
+    MissingLineMarkerError,
+    TruncatedProgramError,
+)
+from oaknut.basic.linenumber import encode_line_number
+from oaknut.basic.tokens import LINE_NUMBER_TOKEN
+def _kinds(tokens):
+    return [token.kind for token in tokens]
+def _values(tokens):
+    return [token.value for token in tokens]
+class TestBodyScan:
+    def test_empty_body_yields_no_tokens(self):
+        assert list(scan(b"")) == []
+    def test_plain_text_is_one_run(self):
+        tokens = list(scan(b"A=1"))
+        assert tokens == [Token(TokenKind.TEXT, "A=1", 0)]
+    def test_keyword_resolves_to_spelling(self):
+        # &F1 is PRINT.
+        tokens = list(scan(b"\xf1"))
+        assert tokens == [Token(TokenKind.KEYWORD, "PRINT", 0, 0xF1)]
+    def test_keyword_splits_surrounding_text(self):
+        # The rheolism case: ?Q <PLOT=&F0> ?Q must keep PLOT distinct.
+        tokens = list(scan(b"?Q\xf0?Q"))
+        assert _kinds(tokens) == [TokenKind.TEXT, TokenKind.KEYWORD, TokenKind.TEXT]
+        assert _values(tokens) == ["?Q", "PLOT", "?Q"]
+        assert tokens[1].token == 0xF0
+        assert [token.start for token in tokens] == [0, 2, 3]
+    def test_string_is_copied_verbatim_through_the_quotes(self):
+        # Token-valued bytes inside a string must not expand.
+        tokens = list(scan(b'"\xf0"'))
+        assert tokens == [Token(TokenKind.STRING, '"\xf0"', 0)]
+    def test_unterminated_string_runs_to_end(self):
+        tokens = list(scan(b'X"ab'))
+        assert _kinds(tokens) == [TokenKind.TEXT, TokenKind.STRING]
+        assert _values(tokens) == ["X", '"ab']
+    def test_line_number_reference_is_decoded(self):
+        body = b"\xe5" + bytes((LINE_NUMBER_TOKEN,)) + encode_line_number(100)
+        tokens = list(scan(body))
+        assert _kinds(tokens) == [TokenKind.KEYWORD, TokenKind.LINENUM]
+        assert tokens[0].value == "GOTO"
+        assert tokens[1] == Token(TokenKind.LINENUM, "100", 1)
+    def test_unknown_high_byte_folds_into_text(self):
+        # &CE is the unused gap: no keyword, but must survive as a byte.
+        tokens = list(scan(b"A\xceB"))
+        assert tokens == [Token(TokenKind.TEXT, "A\xceB", 0)]
+    def test_base_offset_shifts_reported_positions(self):
+        tokens = list(scan(b"\xf0", base_offset=10))
+        assert tokens[0].start == 10
+    def test_truncated_line_number_reference_raises(self):
+        with pytest.raises(TruncatedProgramError):
+            list(scan(bytes((LINE_NUMBER_TOKEN, 0x40, 0x40))))
+class TestDetokeniseBodyProjection:
+    """detokenise_body is exactly the text join of the token stream."""
+    @pytest.mark.parametrize(
+        "body",
+        [
+            b"",
+            b"A=1",
+            b"?Q\xf0?Q",
+            b'PRINT"\xf0\xf1"',
+            b"\xe5" + bytes((LINE_NUMBER_TOKEN,)) + encode_line_number(2000),
+            b"A\xceB",
+            b'X"ab',
+        ],
+    )
+    def test_body_text_is_the_joined_values(self, body):
+        assert detokenise_body(body) == "".join(token.value for token in scan(body))
+    def test_inline_token_body_detokenises_without_framing(self):
+        # The #39 case: a headerless inline-token body the framed
+        # detokenise() would reject at offset 0.
+        assert detokenise_body(b"\xeb\x37") == "MODE7"
+class TestProgramScan:
+    def _program(self, *lines):
+        out = bytearray()
+        for line_number, body in lines:
+            out += bytes((0x0D, (line_number >> 8) & 0xFF, line_number & 0xFF, 4 + len(body)))
+            out += body
+        out += b"\x0d\xff"
+        return bytes(out)
+    def test_walks_records_and_yields_line_numbers(self):
+        program = self._program((10, b"\xeb\x37"), (20, b"?Q\xf0?Q"))
+        records = list(scan_program(program))
+        assert [record.line_number for record in records] == [10, 20]
+        assert all(isinstance(record, LineRecord) for record in records)
+        assert _values(records[1].tokens) == ["?Q", "PLOT", "?Q"]
+    def test_record_start_points_at_the_cr_marker(self):
+        program = self._program((10, b"X"), (20, b"Y"))
+        records = list(scan_program(program))
+        assert records[0].start == 0
+        assert records[1].start == 5
+    def test_empty_program_yields_no_records(self):
+        assert list(scan_program(b"\x0d\xff")) == []
+    def test_missing_line_marker_raises(self):
+        with pytest.raises(MissingLineMarkerError):
+            list(scan_program(b"\xeb"))
+class TestPublicTokenTable:
+    """The byte-level surface a faithful front end consumes (#41)."""
+    def test_table_symbols_are_exported_from_the_package(self):
+        import oaknut.basic as basic
+        for name in (
+            "TOKEN_TO_KEYWORD",
+            "KEYWORDS",
+            "LINE_NUMBER_TOKEN",
+            "decode_line_number",
+            "FLAG_CONDITIONAL",
+            "FLAG_MIDDLE",
+            "FLAG_START",
+            "FLAG_FN_PROC",
+            "FLAG_LINE_NUMBER",
+            "FLAG_STOP_LINE",
+            "FLAG_PSEUDO_VAR",
+        ):
+            assert name in basic.__all__
+            assert hasattr(basic, name)
+    def test_token_to_keyword_resolves_a_keyword(self):
+        from oaknut.basic import TOKEN_TO_KEYWORD
+        assert TOKEN_TO_KEYWORD[0xF1] == "PRINT"
+    def test_token_to_keyword_is_read_only(self):
+        from oaknut.basic import TOKEN_TO_KEYWORD
+        with pytest.raises(TypeError):
+            TOKEN_TO_KEYWORD[0x00] = "OOPS"
+    def test_line_number_helpers_round_trip(self):
+        from oaknut.basic import LINE_NUMBER_TOKEN, decode_line_number
+        assert LINE_NUMBER_TOKEN == 0x8D
+        assert decode_line_number(encode_line_number(1234)) == 1234

oaknut_basic-12.8.2/src/oaknut/basic/detokeniser.py DELETED Viewed

@@ -1,116 +0,0 @@
-"""De-tokenise a stored BBC BASIC II program into source text.
-Mirrors the ROM's ``LIST`` walk: split the program into lines on the
-``&0D`` start-of-line markers, render each line's number followed by its
-de-tokenised body. Within a body, keyword tokens expand to their table
-spelling, ``&8D`` references decode back to plain decimal, and bytes
-inside a quoted string are emitted verbatim (no token expansion).
-The returned string uses latin-1/code-point semantics: a stored byte
-``b`` becomes ``chr(b)``, so the text round-trips byte-exactly through
-:func:`oaknut.basic.tokenise`. Mapping to a host character set (Acorn,
-UTF-8) is the caller's job — the CLI does it at its I/O boundary.
-The line layout (from the disassembly) is::
-    &0D <lineNo-hi> <lineNo-lo> <length> <body...>   ...   &0D &FF
-where *length* counts the whole record from this ``&0D`` up to (not
-including) the next one, so ``length = 4 + len(body)``.
-"""
-from __future__ import annotations
-from oaknut.basic.exceptions import (
-    InvalidLineLengthError,
-    MissingLineMarkerError,
-    TruncatedProgramError,
-)
-from oaknut.basic.linenumber import decode_line_number
-from oaknut.basic.tokens import HEADER_LENGTH, LINE_NUMBER_TOKEN, TOKEN_TO_KEYWORD
-_CR = 0x0D
-_END_MARKER = 0xFF
-_QUOTE = 0x22
-def detokenise(data: bytes) -> str:
-    """De-tokenise a BBC BASIC II program into source text.
-    Args:
-        data: A tokenised program, as stored on disc or in memory,
-            terminated by the ``&0D &FF`` end marker.
-    Returns:
-        The program as numbered source text, lines separated by ``"\\n"``
-        with a trailing newline. Empty for an empty program.
-    Raises:
-        DetokeniseError: The token stream is malformed — a missing line
-            marker, a truncated header or reference, or an impossible
-            length byte.
-    """
-    lines: list[str] = []
-    i = 0
-    n = len(data)
-    while i < n:
-        if data[i] != _CR:
-            raise MissingLineMarkerError(offset=i, found=data[i])
-        if i + 1 >= n:
-            raise TruncatedProgramError(offset=i, detail="line marker with no line number")
-        if data[i + 1] == _END_MARKER:
-            break
-        if i + HEADER_LENGTH > n:
-            raise TruncatedProgramError(offset=i, detail="incomplete line header")
-        line_number = (data[i + 1] << 8) | data[i + 2]
-        length = data[i + 3]
-        if length < HEADER_LENGTH or i + length > n:
-            raise InvalidLineLengthError(offset=i, length=length)
-        body = data[i + HEADER_LENGTH : i + length]
-        lines.append(f"{line_number}{_detokenise_body(body, base_offset=i + HEADER_LENGTH)}")
-        i += length
-    return "".join(f"{line}\n" for line in lines)
-def _detokenise_body(body: bytes, *, base_offset: int) -> str:
-    """Expand one line's body bytes to text.
-    *base_offset* is the absolute offset of ``body[0]`` within the whole
-    program, so any fault can be reported at its true location.
-    """
-    out: list[str] = []
-    i = 0
-    n = len(body)
-    in_string = False
-    while i < n:
-        b = body[i]
-        if in_string:
-            out.append(chr(b))
-            if b == _QUOTE:
-                in_string = False
-            i += 1
-            continue
-        if b == _QUOTE:
-            in_string = True
-            out.append('"')
-            i += 1
-            continue
-        if b == LINE_NUMBER_TOKEN:
-            payload = body[i + 1 : i + 4]
-            if len(payload) != 3:
-                raise TruncatedProgramError(
-                    offset=base_offset + i,
-                    detail="incomplete &8D line-number reference",
-                )
-            out.append(str(decode_line_number(payload)))
-            i += 4
-            continue
-        if b >= 0x80:
-            # Unknown high bytes (e.g. the &CE gap) fall back to a raw
-            # character so the bytes still round-trip.
-            out.append(TOKEN_TO_KEYWORD.get(b, chr(b)))
-            i += 1
-            continue
-        out.append(chr(b))
-        i += 1
-    return "".join(out)

{oaknut_basic-12.8.2 → oaknut_basic-12.9.0}/LICENSE RENAMED Viewed

File without changes

{oaknut_basic-12.8.2 → oaknut_basic-12.9.0}/README.md RENAMED Viewed

File without changes

{oaknut_basic-12.8.2 → oaknut_basic-12.9.0}/pyproject.toml RENAMED Viewed

File without changes

{oaknut_basic-12.8.2 → oaknut_basic-12.9.0}/setup.cfg RENAMED Viewed

File without changes

{oaknut_basic-12.8.2 → oaknut_basic-12.9.0}/src/oaknut/basic/datafile.py RENAMED Viewed

File without changes

{oaknut_basic-12.8.2 → oaknut_basic-12.9.0}/src/oaknut/basic/exceptions.py RENAMED Viewed

File without changes

{oaknut_basic-12.8.2 → oaknut_basic-12.9.0}/src/oaknut/basic/float5.py RENAMED Viewed

File without changes

{oaknut_basic-12.8.2 → oaknut_basic-12.9.0}/src/oaknut/basic/linenumber.py RENAMED Viewed

File without changes

{oaknut_basic-12.8.2 → oaknut_basic-12.9.0}/src/oaknut/basic/numbering.py RENAMED Viewed

File without changes

{oaknut_basic-12.8.2 → oaknut_basic-12.9.0}/src/oaknut/basic/tokeniser.py RENAMED Viewed

File without changes

{oaknut_basic-12.8.2 → oaknut_basic-12.9.0}/src/oaknut_basic.egg-info/dependency_links.txt RENAMED Viewed

File without changes

{oaknut_basic-12.8.2 → oaknut_basic-12.9.0}/src/oaknut_basic.egg-info/entry_points.txt RENAMED Viewed

File without changes

{oaknut_basic-12.8.2 → oaknut_basic-12.9.0}/src/oaknut_basic.egg-info/requires.txt RENAMED Viewed

File without changes

{oaknut_basic-12.8.2 → oaknut_basic-12.9.0}/src/oaknut_basic.egg-info/top_level.txt RENAMED Viewed

File without changes

{oaknut_basic-12.8.2 → oaknut_basic-12.9.0}/tests/test_basic.py RENAMED Viewed

File without changes

{oaknut_basic-12.8.2 → oaknut_basic-12.9.0}/tests/test_crunch_rules.py RENAMED Viewed

File without changes

{oaknut_basic-12.8.2 → oaknut_basic-12.9.0}/tests/test_data_cli.py RENAMED Viewed

File without changes

{oaknut_basic-12.8.2 → oaknut_basic-12.9.0}/tests/test_datafile.py RENAMED Viewed

File without changes

{oaknut_basic-12.8.2 → oaknut_basic-12.9.0}/tests/test_detokeniser.py RENAMED Viewed

File without changes

{oaknut_basic-12.8.2 → oaknut_basic-12.9.0}/tests/test_float5.py RENAMED Viewed

File without changes

{oaknut_basic-12.8.2 → oaknut_basic-12.9.0}/tests/test_keyword_coverage.py RENAMED Viewed

File without changes

{oaknut_basic-12.8.2 → oaknut_basic-12.9.0}/tests/test_linenumber.py RENAMED Viewed

File without changes

{oaknut_basic-12.8.2 → oaknut_basic-12.9.0}/tests/test_numbering.py RENAMED Viewed

File without changes

{oaknut_basic-12.8.2 → oaknut_basic-12.9.0}/tests/test_rom_golden.py RENAMED Viewed

File without changes

{oaknut_basic-12.8.2 → oaknut_basic-12.9.0}/tests/test_rom_golden_detokenise.py RENAMED Viewed

File without changes

{oaknut_basic-12.8.2 → oaknut_basic-12.9.0}/tests/test_tokeniser.py RENAMED Viewed

File without changes

{oaknut_basic-12.8.2 → oaknut_basic-12.9.0}/tests/test_tokens.py RENAMED Viewed

File without changes

oaknut-basic 12.8.2__tar.gz → 12.9.0__tar.gz

oaknut-basic 12.8.2__tar.gz → 12.9.0__tar.gz