PyPI - pagefind - Versions diffs - 1.2.0__tar.gz - Mend

pagefind 1.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9) hide show

pagefind-1.2.0/PKG-INFO +98 -0
pagefind-1.2.0/README.md +75 -0
pagefind-1.2.0/pyproject.toml +67 -0
pagefind-1.2.0/src/pagefind/__init__.py +18 -0
pagefind-1.2.0/src/pagefind/__main__.py +13 -0
pagefind-1.2.0/src/pagefind/index/__init__.py +274 -0
pagefind-1.2.0/src/pagefind/py.typed +0 -0
pagefind-1.2.0/src/pagefind/service/__init__.py +236 -0
pagefind-1.2.0/src/pagefind/service/types.py +168 -0

pagefind-1.2.0/PKG-INFO ADDED Viewed

@@ -0,0 +1,98 @@
+Metadata-Version: 2.1
+Name: pagefind
+Version: 1.2.0
+Summary: Python API for Pagefind
+License: MIT
+Author: CloudCannon
+Requires-Python: >=3.9
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
+Classifier: Topic :: Text Processing :: Indexing
+Classifier: Topic :: Text Processing :: Markup :: HTML
+Provides-Extra: bin
+Provides-Extra: extended
+Requires-Dist: pagefind_bin (>=1.2.0,<1.3.0) ; extra == "bin"
+Requires-Dist: pagefind_bin_extended (>=1.2.0,<1.3.0) ; extra == "extended"
+Description-Content-Type: text/markdown
+# `pagefind`
+An async Python API for the [pagefind](https://pagefind.app) binary.
+## Installation
+```sh
+python3 -m pip install 'pagefind[bin]'
+python3 -m pagefind --help
+```
+## Usage
+<!--[[[cog
+  print("```py")
+  print(open('./src/tests/integration.py').read())
+  print("```")
+]]] -->
+```py
+import asyncio
+import json
+import logging
+import os
+from pagefind.index import PagefindIndex, IndexConfig
+logging.basicConfig(level=os.environ.get("LOG_LEVEL", "INFO"))
+log = logging.getLogger(__name__)
+html_content = (
+    "<html>"
+    "  <body>"
+    "    <main>"
+    "      <h1>Example HTML</h1>"
+    "      <p>This is an example HTML page.</p>"
+    "    </main>"
+    "  </body>"
+    "</html>"
+)
+def prefix(pre: str, s: str) -> str:
+    return pre + s.replace("\n", f"\n{pre}")
+async def main():
+    config = IndexConfig(
+        root_selector="main", logfile="index.log", output_path="./output", verbose=True
+    )
+    async with PagefindIndex(config=config) as index:
+        log.debug("opened index")
+        new_file, new_record, new_dir = await asyncio.gather(
+            index.add_html_file(
+                content=html_content,
+                url="https://example.com",
+                source_path="other/example.html",
+            ),
+            index.add_custom_record(
+                url="/elephants/",
+                content="Some testing content regarding elephants",
+                language="en",
+                meta={"title": "Elephants"},
+            ),
+            index.add_directory("./public"),
+        )
+        print(prefix("new_file    ", json.dumps(new_file, indent=2)))
+        print(prefix("new_record  ", json.dumps(new_record, indent=2)))
+        print(prefix("new_dir     ", json.dumps(new_dir, indent=2)))
+        files = await index.get_files()
+        for file in files:
+            print(prefix("files", f"{len(file['content']):10}B {file['path']}"))
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+<!-- [[[end]]] -->

pagefind-1.2.0/README.md ADDED Viewed

@@ -0,0 +1,75 @@
+# `pagefind`
+An async Python API for the [pagefind](https://pagefind.app) binary.
+## Installation
+```sh
+python3 -m pip install 'pagefind[bin]'
+python3 -m pagefind --help
+```
+## Usage
+<!--[[[cog
+  print("```py")
+  print(open('./src/tests/integration.py').read())
+  print("```")
+]]] -->
+```py
+import asyncio
+import json
+import logging
+import os
+from pagefind.index import PagefindIndex, IndexConfig
+logging.basicConfig(level=os.environ.get("LOG_LEVEL", "INFO"))
+log = logging.getLogger(__name__)
+html_content = (
+    "<html>"
+    "  <body>"
+    "    <main>"
+    "      <h1>Example HTML</h1>"
+    "      <p>This is an example HTML page.</p>"
+    "    </main>"
+    "  </body>"
+    "</html>"
+)
+def prefix(pre: str, s: str) -> str:
+    return pre + s.replace("\n", f"\n{pre}")
+async def main():
+    config = IndexConfig(
+        root_selector="main", logfile="index.log", output_path="./output", verbose=True
+    )
+    async with PagefindIndex(config=config) as index:
+        log.debug("opened index")
+        new_file, new_record, new_dir = await asyncio.gather(
+            index.add_html_file(
+                content=html_content,
+                url="https://example.com",
+                source_path="other/example.html",
+            ),
+            index.add_custom_record(
+                url="/elephants/",
+                content="Some testing content regarding elephants",
+                language="en",
+                meta={"title": "Elephants"},
+            ),
+            index.add_directory("./public"),
+        )
+        print(prefix("new_file    ", json.dumps(new_file, indent=2)))
+        print(prefix("new_record  ", json.dumps(new_record, indent=2)))
+        print(prefix("new_dir     ", json.dumps(new_dir, indent=2)))
+        files = await index.get_files()
+        for file in files:
+            print(prefix("files", f"{len(file['content']):10}B {file['path']}"))
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+<!-- [[[end]]] -->

pagefind-1.2.0/pyproject.toml ADDED Viewed

@@ -0,0 +1,67 @@
+[tool.poetry]
+name = "pagefind"
+version = "1.2.0"
+# note this^^^^^^^ version will be replaced by scripts/build/api_package.py
+description = "Python API for Pagefind"
+authors = ["CloudCannon"]
+license = "MIT"
+readme = "README.md"
+include = []
+exclude = [
+    "dist",
+    "output",
+    "*.whl",
+    "*.egg-info",
+    "*.log",
+    ".venv",
+    "pagefind_python_bin" # poetry has a *.pth file in its .venv that causes
+    # directories in src/ to be preferentially imported. To allow testing
+    # `import pagefind_bin`, we use ./src/pagefind_python_bin as a workaround.
+]
+classifiers = [
+    "License :: OSI Approved :: MIT License",
+    "Topic :: Text Processing :: Indexing",
+    "Topic :: Text Processing :: Markup :: HTML",
+]
+# Note: we *aren't* including an `entry-points` section here to avoid clobbering
+# the user's natively-installed `pagefind` binary. Using `python3 -m pagefind`
+# is an informatively-namespaced alternative that doesn't add too many keystrokes.
+# See https://packaging.python.org/en/latest/specifications/entry-points/
+[tool.poetry.dependencies]
+python = ">=3.9"
+# during the building of the `pagefind` python package, the pagefind binary packages
+# aren't yet published. Thus, `poetry lock` will fail if we include them here.
+# However, `poetry build` fails to include the binary package extras in
+# `pagefind`'s distribution info if these lines are commented out. Thus,
+# we temporarily uncomment these lines during the build process, and then re-comment
+# them afterwards
+# these next two lines are owned by ./scripts/build/api_package.py
+pagefind_bin = { version = "~=1.2.0", optional = true }
+pagefind_bin_extended = { version = "~=1.2.0", optional = true }
+[tool.poetry.extras]
+bin = ["pagefind_bin"]
+extended = ["pagefind_bin_extended"]
+[tool.poetry.group.dev.dependencies]
+ruff = "^0.5.0"
+mypy = "^1.10.1"
+wheel = "^0.43.0"
+cogapp = "^3.4.1"
+twine = "^5.1.1"
+docutils = "^0.21.2"
+[build-system]
+requires = ["poetry-core"]
+build-backend = "poetry.core.masonry.api"
+# note that poetry can currently only build `purelib`s, or pure-python wheels.
+# (see https://python-poetry.org/docs/cli#build)
+# This means poetry can't handle building wheels that contain pagefind's binaries,
+#  which are necessarily platform-dependent.
+# For more information on purelibs/pure-python wheels, see
+# https://peps.python.org/pep-0427/#what-s-the-deal-with-purelib-vs-platlib

pagefind-1.2.0/src/pagefind/__init__.py ADDED Viewed

@@ -0,0 +1,18 @@
+#!/usr/bin/env python3
+# assume the python version is >= 3.9, which is the oldest LTS version with
+# more 2 months of life as of the time of writing, 2024-08-18
+# https://docs.python.org/3/reference/datamodel.html#async-context-managers
+# https://docs.python.org/3/library/contextlib.html#contextlib.asynccontextmanager
+# [[[cog
+# import tomllib # ok since the development environment must be python >= 3.11
+# from pathlib import Path
+# pyproject = Path("pyproject.toml") # note the CWD is the project root
+# assert pyproject.is_file(), f"expected {pyproject.absolute()} to be a file"
+# version = tomllib.load(pyproject.open("rb"))["tool"]["poetry"]["version"]
+# print(f'__version__ = "{version}"')
+# ]]]
+__version__ = "0.0.0a0"
+# [[[end]]]

pagefind-1.2.0/src/pagefind/__main__.py ADDED Viewed

@@ -0,0 +1,13 @@
+import os
+import sys
+from .service import _must_get_executable
+bin = str(_must_get_executable().resolve().absolute())
+argv = [bin, *sys.argv[1:]]
+if os.name == "posix":
+    os.execv(bin, argv)
+else:
+    import subprocess
+    sys.exit(subprocess.call(argv))

pagefind-1.2.0/src/pagefind/index/__init__.py ADDED Viewed

@@ -0,0 +1,274 @@
+import logging
+import base64
+from typing import Any, Dict, List, Optional, Sequence, TypedDict, cast
+from ..service import PagefindService
+from ..service.types import (
+    InternalAddDirRequest,
+    InternalAddFileRequest,
+    InternalAddRecordRequest,
+    InternalDeleteIndexRequest,
+    InternalGetFilesRequest,
+    InternalGetFilesResponse,
+    InternalIndexedDirResponse,
+    InternalIndexedFileResponse,
+    InternalDecodedFile,
+    InternalWriteFilesRequest,
+)
+log = logging.getLogger(__name__)
+class IndexConfig(TypedDict, total=False):
+    """Configuration dictionary accepted by ``PagefindIndex``.
+    Declared with ``total=False``, so every key may be omitted.
+    """
+    root_selector: Optional[str]
+    """
+    The root selector to use for the index.
+    If not supplied, Pagefind will use the ``<html>`` tag.
+    """
+    exclude_selectors: Optional[Sequence[str]]
+    """Extra element selectors that Pagefind should ignore when indexing."""
+    force_language: Optional[str]
+    """
+    Ignores any detected languages and creates a single index for the entire site as the
+    provided language. Expects an ISO 639-1 code, such as ``en`` or ``pt``.
+    """
+    verbose: Optional[bool]
+    """
+    Prints extra logging while indexing the site. Only affects the CLI, does not impact
+    web-facing search.
+    """
+    logfile: Optional[str]
+    """
+    A path to a file to log indexing output to in addition to stdout.
+    The file will be created if it doesn't exist and overwritten on each run.
+    """
+    keep_index_url: Optional[bool]
+    """Whether to keep ``index.html`` at the end of search result paths.
+    By default, a file at ``animals/cat/index.html`` will be given the URL
+    ``/animals/cat/``. Setting this option to ``True`` will result in the URL
+    ``/animals/cat/index.html``.
+    """
+    output_path: Optional[str]
+    """
+    The folder to output the search bundle into, relative to the processed site.
+    Defaults to ``pagefind``.
+    """
+class PagefindIndex:
+    """Manages a Pagefind index.
+    ``PagefindIndex`` operates as an async context manager.
+    Entering the context starts a backing Pagefind service and creates an in-memory index in the backing service.
+    Exiting the context writes the in-memory index to disk and then shuts down the backing Pagefind service.
+    Each method of ``PagefindIndex`` that talks to the backing Pagefind service can raise errors.
+    If an exception is raised inside ``PagefindIndex``'s context, the context closes without writing the index files to disk.
+    ``PagefindIndex`` optionally takes a configuration dictionary (``IndexConfig``) that can apply parts of the Pagefind CLI config.
+    See the relevant documentation for these configuration options in the
+    `Configuring the Pagefind CLI <https://pagefind.app/docs/config-options/>`_ documentation.
+    """
+    # Class-level defaults; __init__ overwrites all three on every instance.
+    _service: Optional["PagefindService"] = None
+    _index_id: Optional[int] = None
+    _config: Optional[IndexConfig] = None
+    """Note that config should be immutable."""
+    def __init__(
+        self,
+        config: Optional[IndexConfig] = None,
+        *,
+        _service: Optional["PagefindService"] = None,
+        _index_id: Optional[int] = None,
+    ):
+        self._service = _service
+        self._index_id = _index_id
+        self._config = config
+    async def _start(self) -> "PagefindIndex":
+        """Start the backing Pagefind service and create an in-memory index.
+        Launches a new ``PagefindService``, asks it to create an index from
+        ``self._config`` and records that index's id on this instance.
+        :return: this same ``PagefindIndex`` instance.
+        """
+        # an instance that already has a service or an index id must not be started again
+        assert self._index_id is None
+        assert self._service is None
+        self._service = await PagefindService().launch()
+        # create_index returns a separate PagefindIndex object; only its id is kept here
+        _index = await self._service.create_index(self._config)
+        self._index_id = _index._index_id
+        return self
+    async def add_html_file(
+        self,
+        *,
+        content: str,
+        source_path: Optional[str] = None,
+        url: Optional[str] = None,
+    ) -> InternalIndexedFileResponse:
+        """Add an HTML file to the index.
+        :param content: The source HTML content of the file to be parsed.
+        :param source_path: The source path of the HTML file would have on disk. \
+            Must be a relative path, or an absolute path within the current working directory. \
+            Pagefind will compute the result URL from this path.
+        :param url: an explicit URL to use, instead of having Pagefind compute the \
+            URL based on the source_path. If not supplied, source_path must be supplied.
+        """
+        assert self._service is not None
+        assert self._index_id is not None
+        result = await self._service.send(
+            InternalAddFileRequest(
+                type="AddFile",
+                index_id=self._index_id,
+                url=url,
+                file_contents=content,
+                file_path=source_path,
+            )
+        )
+        assert result["type"] == "IndexedFile"
+        return cast(InternalIndexedFileResponse, result)
+    async def add_directory(
+        self, path: str, *, glob: Optional[str] = None
+    ) -> InternalIndexedDirResponse:
+        """Indexes a directory from disk using the standard Pagefind indexing behaviour.
+        This is equivalent to running the Pagefind binary with ``--site <dir>``.
+        :param path: the path to the directory to index. If the `path` provided is relative, \
+                it will be relative to the current working directory of your Python process.
+        :param glob: a glob pattern to filter files in the directory. If not provided, all \
+            files matching ``**.{html}`` are indexed. For more information on glob patterns, \
+            see the `Wax patterns documentation <https://github.com/olson-sean-k/wax#patterns>`.
+        """
+        assert self._service is not None
+        assert self._index_id is not None
+        result = await self._service.send(
+            InternalAddDirRequest(
+                type="AddDir",
+                index_id=self._index_id,
+                path=path,
+                glob=glob,
+            )
+        )
+        assert result["type"] == "IndexedDir"
+        return cast(InternalIndexedDirResponse, result)
+    async def get_files(self) -> List[InternalDecodedFile]:
+        """Get raw data of all files in the Pagefind index.
+        WATCH OUT: this method emits all files. This can be a lot of data, and
+        this amount of data can cause reading from the subprocess pipes to deadlock.
+        STRICTLY PREFER calling ``self.write_files()``.
+        """
+        assert self._service is not None
+        assert self._index_id is not None
+        response = await self._service.send(
+            InternalGetFilesRequest(type="GetFiles", index_id=self._index_id)
+        )
+        assert response["type"] == "GetFiles"
+        files = cast(InternalGetFilesResponse, response)["files"]
+        decoded_files = [
+            {"path": file["path"], "content": base64.b64decode(file["content"])}
+            for file in files
+        ]
+        return cast(List[InternalDecodedFile], decoded_files)
+    async def delete_index(self) -> None:
+        """
+        Deletes the data for the given index from its backing Pagefind service.
+        Doesn't affect any written files or data returned by ``get_files()``.
+        Afterwards this object no longer references the service or the index, so
+        its other methods will fail their ``assert`` checks.
+        """
+        assert self._service is not None
+        assert self._index_id is not None
+        result = await self._service.send(
+            InternalDeleteIndexRequest(type="DeleteIndex", index_id=self._index_id)
+        )
+        assert result["type"] == "DeleteIndex"
+        # NOTE(review): the service reference is dropped here without calling close().
+        # __aexit__ returns early when _service is None, so it will not close the
+        # service either -- confirm the backend process is stopped elsewhere.
+        self._index_id = None
+        self._service = None
+    async def add_custom_record(
+        self,
+        *,
+        url: str,
+        content: str,
+        language: str,
+        meta: Optional[Dict[str, str]] = None,
+        filters: Optional[Dict[str, List[str]]] = None,
+        sort: Optional[Dict[str, str]] = None,
+    ) -> InternalIndexedFileResponse:
+        """Add a direct record to the Pagefind index.
+        This method is useful for adding non-HTML content to the search results.
+        :param content: the raw content of this record.
+        :param url: the output URL of this record. Pagefind will not alter this.
+        :param language: ISO 639-1 code of the language this record is written in.
+        :param meta: the metadata to attach to this record. Supplying a ``title`` is highly recommended.
+        :param filters: the filters to attach to this record. Filters are used to group records together.
+        :param sort: the sort keys to attach to this record.
+        """
+        assert self._service is not None
+        assert self._index_id is not None
+        result = await self._service.send(
+            InternalAddRecordRequest(
+                type="AddRecord",
+                index_id=self._index_id,
+                url=url,
+                content=content,
+                language=language,
+                meta=meta,
+                filters=filters,
+                sort=sort,
+            )
+        )
+        assert result["type"] == "IndexedFile"
+        return cast(InternalIndexedFileResponse, result)
+    async def write_files(self, output_path: Optional[str] = None) -> None:
+        """Write the index files to disk.
+        If you're using PagefindIndex as a context manager, there's no need to call this method:
+        if no error occurred, closing the context automatically writes the index files to disk.
+        :param output_path: a path to override the configured output path for the index.
+        """
+        assert self._service is not None
+        assert self._index_id is not None
+        if not output_path:
+            if not self._config:
+                output_path = None
+            else:
+                output_path = self._config.get("output_path")
+        result = await self._service.send(
+            InternalWriteFilesRequest(
+                type="WriteFiles",
+                index_id=self._index_id,
+                output_path=output_path,
+            )
+        )
+        assert result["type"] == "WriteFiles"
+    async def __aenter__(self) -> "PagefindIndex":
+        """Enter the async context: start the service and create the index via ``_start()``."""
+        # only an instance with no service and no index id yet may be entered
+        assert self._service is None
+        assert self._index_id is None
+        return await self._start()
+    async def __aexit__(
+        self,
+        exc_type: Optional[Any],
+        exc_value: Optional[Any],
+        traceback: Optional[Any],
+    ) -> None:
+        if self._service is None:
+            return
+        if self._index_id is None:
+            return
+        if exc_type is None:
+            await self.write_files()
+        await self._service.close()

pagefind-1.2.0/src/pagefind/py.typed ADDED Viewed

File without changes

pagefind-1.2.0/src/pagefind/service/__init__.py ADDED Viewed

@@ -0,0 +1,236 @@
+import asyncio
+import base64
+import json
+import logging
+import os
+import shutil
+from contextlib import AbstractAsyncContextManager
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, cast
+from .types import (
+    InternalNewIndexRequest,
+    InternalNewIndexResponse,
+    InternalRequestPayload,
+    InternalResponseError,
+    InternalResponsePayload,
+    InternalResponseType,
+    InternalServiceRequest,
+    InternalServiceResponse,
+    InternalSyntheticFile,
+)
+if TYPE_CHECKING:
+    from ..index import IndexConfig, PagefindIndex
+log = logging.getLogger(__name__)
+__all__ = ["PagefindService", "get_executable"]
+def get_executable() -> Optional[Path]:
+    env_bin_path = os.getenv("PAGEFIND_BINARY_PATH")
+    if env_bin_path is not None:
+        log.debug(f"using {env_bin_path}")
+        return Path(env_bin_path)
+    try:
+        from pagefind_bin_extended import get_executable  # type: ignore
+        extended: Path = get_executable()
+        log.debug(f"using {extended}")
+        return extended
+    except ImportError:
+        log.debug("unable to import pagefind_bin_extended")
+    try:
+        from pagefind_bin import get_executable  # type: ignore
+        bin: Path = get_executable()
+        log.debug(f"using {bin}")
+        return bin
+    except ImportError:
+        log.debug("unable to import pagefind_bin")
+    external: Optional[str] = shutil.which("pagefind_extended")
+    external = external or shutil.which("pagefind")
+    if external is None:
+        log.debug("Could not find externally-installed pagefind binary")
+        return None
+    else:
+        log.debug(f"using {external}")
+        return Path(external)
+def _must_get_executable() -> Path:
+    if (bin := get_executable()) is None:
+        raise FileNotFoundError("Could not find pagefind binary")
+    return bin
+def _encode(req: InternalServiceRequest) -> bytes:
+    return base64.b64encode(json.dumps(req).encode("utf-8"))
+class PagefindService(AbstractAsyncContextManager["PagefindService"]):
+    """Runs the Pagefind binary as a subprocess and exchanges messages with it over its pipes.
+    Usable as an async context manager: entering launches the subprocess and
+    exiting closes it.
+    """
+    _bin: Path  # path of the Pagefind executable
+    _backend: asyncio.subprocess.Process  # assigned by launch()
+    _message_id: int = 0  # incremented by send() before each request
+    _responses: Dict[int, asyncio.Future[InternalResponsePayload]]  # response futures keyed by message id
+    _loop: asyncio.AbstractEventLoop
+    _poll_task: asyncio.Task[None]  # assigned by launch()
+    # _messages
+    def __init__(self) -> None:
+        """Capture the event loop and locate the Pagefind binary.
+        :raises FileNotFoundError: if no Pagefind binary can be found.
+        """
+        self._loop = asyncio.get_event_loop()
+        self._bin = _must_get_executable()
+        self._responses = dict()
+    async def launch(self) -> "PagefindService":
+        """Spawn the Pagefind binary in ``--service`` mode and start polling for its responses.
+        The subprocess runs in the current working directory with piped stdin and
+        stdout; its stderr is redirected into stdout.
+        :return: this same service instance.
+        """
+        log.debug(f"launching {self._bin}")
+        # TODO: detach process on windows?
+        # creation_flags: int = 0
+        # if platform.system().lower() == "windows":
+        #     creation_flags = subprocess.CREATE_NO_WINDOW | subprocess.CREATE_DETACHED
+        self._backend = await asyncio.create_subprocess_exec(
+            self._bin,
+            "--service",
+            # "--verbose", # <- verbose emits debug logs to stdout, which is also used for IPC
+            cwd=os.getcwd(),
+            stdin=asyncio.subprocess.PIPE,
+            stdout=asyncio.subprocess.PIPE,
+            stderr=asyncio.subprocess.STDOUT,
+            limit=2**21,  # <- 2MiB
+            # anything less and the _wait_for_responses loop will hang
+            # due to the stdout pipes deadlocking due to the buffer filling up
+        )
+        log.debug(f"launched {self._bin}: {self._backend}.")
+        log.debug("polling for responses")
+        # the reader runs as a background task on the loop captured in __init__
+        self._poll_task = self._loop.create_task(self._wait_for_responses())
+        log.debug(f"polling task created: {self._poll_task}")
+        return self
+    async def send(self, payload: InternalRequestPayload) -> InternalResponsePayload:
+        self._message_id += 1
+        message_id = self._message_id
+        if (_ := self._responses.get(message_id)) is not None:
+            raise KeyError(f"message_id {message_id} already in use")
+        else:
+            future: asyncio.Future[InternalResponsePayload] = self._loop.create_future()
+            self._responses[message_id] = future
+        # FIXME: check stdin not none?
+        if self._backend.stdin is None:
+            # restart the backend
+            log.debug("restarting backend")
+            await self.launch()
+            log.debug("backend restarted")
+        assert self._backend.stdin is not None
+        req = InternalServiceRequest(message_id=message_id, payload=payload)
+        log.debug(f"sending request: {req}")
+        self._backend.stdin.write(_encode(req) + b",")
+        # backend waits for a comma before responding
+        await self._backend.stdin.drain()
+        log.debug(f"request sent: {req}")
+        result = await future
+        if result["type"] == InternalResponseType.GET_FILES.value:  # these are HUGE
+            if (files := result.get("files")) is not None:
+                files = cast(List[InternalSyntheticFile], files)
+                base64_ch = sum(len(file["content"]) for file in files)
+                log.debug(f"received response: <{len(files)} files, {base64_ch} chars>")
+        else:
+            log.debug(f"received response: {result}")
+        return result
+    async def _wait_for_responses(self) -> None:
+        """
+        Poll the subprocess's stdout for responses
+        """
+        while True:
+            await asyncio.sleep(0.1)
+            assert self._backend.stdout is not None
+            log.debug("checking for data")
+            output = await self._backend.stdout.readuntil(b",")
+            if len(output) <= 200:
+                log.debug(f"received data: {output!r}")
+            else:
+                log.debug(
+                    f"received data: {output[:30]!r}...{len(output) - 40}B...{output[-10:]!r}"
+                )
+            if (resp := json.loads(base64.b64decode(output[:-1]))) is None:
+                continue
+            resp = cast(InternalServiceResponse, resp)
+            message_id = resp.get("message_id")
+            if message_id is None:
+                # If the backend service failed to parse the message, it won't return the ID
+                # However it does return the message itself, so we can retrieve the ID we sent
+                if (orginal := resp["payload"].get("original_message")) is not None:
+                    if (sent := json.loads(orginal)) is not None:
+                        message_id = sent.get("message_id")
+            if message_id is not None:
+                log.debug(f"received response for message {message_id}")
+                assert (
+                    self._message_id >= message_id
+                ), f"message_id out of order: incoming {message_id} > current: {self._message_id}"
+                if (future := self._responses.get(message_id)) is not None:
+                    log.debug(f"resolving future for message {message_id}")
+                    payload = resp["payload"]
+                    if payload["type"] == InternalResponseType.ERROR.value:
+                        exc = cast(InternalResponseError, payload)
+                        future.set_exception(
+                            Exception(exc["message"], exc.get("original_message"))
+                        )
+                    else:
+                        future.set_result(cast(InternalResponsePayload, payload))
+                else:
+                    log.debug(f"no receiving future for message {message_id}")
+                    # FIXME: figure out how to surface the error
+                    payload = cast(InternalResponseError, resp["payload"])
+                    # assert (
+                    #     payload["type"] == InternalResponseType.ERROR.value
+                    # ), f"unexpected message type: {payload['type']}"
+    async def close(self) -> None:
+        # wait for all _responses to be resolved
+        log.debug("waiting for all responses to be resolved")
+        try:
+            # wait at most 5s for all responses to be resolved
+            async with asyncio.timeout(5):
+                await asyncio.gather(*self._responses.values())
+                log.debug("all responses resolved")
+        except asyncio.TimeoutError:
+            log.error("timed out waiting for responses to be resolved")
+        self._poll_task.cancel()
+        self._backend.terminate()
+        await self._backend.wait()
+        log.debug("backend terminated")
+    async def __aenter__(self) -> "PagefindService":
+        """Enter the async context by launching the backend; returns this service."""
+        return await self.launch()
+    async def __aexit__(
+        self,
+        exc_type: Optional[Any],
+        exc_value: Optional[Any],
+        traceback: Optional[Any],
+    ) -> None:
+        """Leave the async context by closing the service; exceptions are not suppressed."""
+        await self.close()
+    async def create_index(
+        self, config: Optional["IndexConfig"] = None
+    ) -> "PagefindIndex":
+        from ..index import PagefindIndex
+        _config: Optional["IndexConfig"] = None
+        if config is not None:
+            _config = {**config}  # clone the config to avoid modifying the original
+            _config.pop("output_path", None)
+        log.debug(f"creating index with config: {_config}")
+        result = await self.send(
+            InternalNewIndexRequest(type="NewIndex", config=_config)
+        )
+        log.debug(f"received response: {result}")
+        assert result["type"] == "NewIndex"
+        result = cast(InternalNewIndexResponse, result)
+        return PagefindIndex(config=config, _service=self, _index_id=result["index_id"])

pagefind-1.2.0/src/pagefind/service/types.py ADDED Viewed

@@ -0,0 +1,168 @@
+from enum import Enum
+from typing import Dict, List, Literal, Optional, Sequence, TypedDict, Union
+class InternalRequestType(Enum):
+    """Values of the ``type`` field of the request payloads declared in this module."""
+    NEW_INDEX = "NewIndex"
+    ADD_FILE = "AddFile"
+    ADD_RECORD = "AddRecord"
+    ADD_DIR = "AddDir"
+    WRITE_FILES = "WriteFiles"
+    GET_FILES = "GetFiles"
+    DELETE_INDEX = "DeleteIndex"
+class InternalPagefindServiceConfig(TypedDict, total=False):
+    """Index configuration carried in the ``config`` field of a ``NewIndex`` request.
+    Declared with ``total=False``, so every key may be omitted.
+    """
+    # FIXME: document
+    root_selector: Optional[str]
+    exclude_selectors: Optional[Sequence[str]]
+    force_language: Optional[str]
+    verbose: Optional[bool]
+    logfile: Optional[str]
+    keep_index_url: Optional[bool]
+class InternalNewIndexRequest(TypedDict):
+    """Payload of a ``NewIndex`` request."""
+    type: Literal["NewIndex"]
+    config: Optional[InternalPagefindServiceConfig]
+class InternalAddFileRequest(TypedDict):
+    """Payload of an ``AddFile`` request."""
+    type: Literal["AddFile"]
+    index_id: int
+    """index_id must be positive."""
+    file_path: Optional[str]
+    url: Optional[str]
+    file_contents: str
+class InternalAddRecordRequest(TypedDict):
+    """Payload of an ``AddRecord`` request."""
+    type: Literal["AddRecord"]
+    index_id: int
+    """index_id must be positive."""
+    url: str
+    content: str
+    language: str
+    meta: Optional[Dict[str, str]]
+    filters: Optional[Dict[str, List[str]]]
+    sort: Optional[Dict[str, str]]
+class InternalAddDirRequest(TypedDict, total=False):
+    """Payload of an ``AddDir`` request; ``total=False``, so keys may be omitted."""
+    type: Literal["AddDir"]
+    index_id: int
+    path: str  # TODO: support Path
+    glob: Optional[str]
+class InternalWriteFilesRequest(TypedDict, total=False):
+    """Payload of a ``WriteFiles`` request; ``total=False``, so keys may be omitted."""
+    type: Literal["WriteFiles"]
+    index_id: int
+    """index_id must be positive."""
+    output_path: Optional[str]
+class InternalGetFilesRequest(TypedDict):
+    """Payload of a ``GetFiles`` request."""
+    type: Literal["GetFiles"]
+    index_id: int
+    """index_id must be positive."""
+class InternalDeleteIndexRequest(TypedDict):
+    """Payload of a ``DeleteIndex`` request."""
+    type: Literal["DeleteIndex"]
+    index_id: int
+    """index_id must be positive."""
+# Any one of the request payload types above.
+InternalRequestPayload = Union[
+    InternalNewIndexRequest,
+    InternalAddFileRequest,
+    InternalAddRecordRequest,
+    InternalAddDirRequest,
+    InternalWriteFilesRequest,
+    InternalGetFilesRequest,
+    InternalDeleteIndexRequest,
+]
+class InternalServiceRequest(TypedDict):
+    """Envelope pairing a request payload with its message id."""
+    message_id: Optional[int]
+    payload: InternalRequestPayload
+class InternalResponseType(Enum):
+    """Values of the ``type`` field of the response payloads declared in this module."""
+    NEW_INDEX = "NewIndex"
+    INDEXED_FILE = "IndexedFile"
+    INDEXED_DIR = "IndexedDir"
+    WRITE_FILES = "WriteFiles"
+    GET_FILES = "GetFiles"
+    DELETE_INDEX = "DeleteIndex"
+    ERROR = "Error"
+class InternalResponseError(TypedDict):
+    """Payload of an ``Error`` response."""
+    type: Literal["Error"]
+    message: str
+    original_message: Optional[str]
+class InternalNewIndexResponse(TypedDict):
+    """Payload of a ``NewIndex`` response, carrying the id of the created index."""
+    type: Literal["NewIndex"]
+    index_id: int
+class InternalIndexedFileResponse(TypedDict):
+    """Payload of an ``IndexedFile`` response."""
+    type: Literal["IndexedFile"]
+    page_word_count: int
+    page_url: str
+    page_meta: Dict[str, str]
+class InternalIndexedDirResponse(TypedDict):
+    type: str
+    page_count: int
+class InternalWriteFilesResponse(TypedDict):
+    type: Literal["IndexedFile"]
+    output_path: str
+class InternalSyntheticFile(TypedDict):
+    """A file entry of a ``GetFiles`` response, with its content as a string."""
+    path: str
+    content: str
+class InternalDecodedFile(TypedDict):
+    """A file entry whose content is held as bytes."""
+    path: str
+    content: bytes
+class InternalGetFilesResponse(TypedDict):
+    """Payload of a ``GetFiles`` response."""
+    type: Literal["GetFiles"]
+    files: List[InternalSyntheticFile]
+class InternalDeleteIndexResponse(TypedDict):
+    """Payload of a ``DeleteIndex`` response."""
+    type: Literal["DeleteIndex"]
+# Any one of the non-error response payload types.
+InternalResponsePayload = Union[
+    InternalNewIndexResponse,
+    InternalIndexedFileResponse,
+    InternalIndexedDirResponse,
+    InternalWriteFilesResponse,
+    InternalGetFilesResponse,
+    InternalDeleteIndexResponse,
+]
+class InternalServiceResponse(TypedDict):
+    """Envelope pairing a response payload (or an error) with its message id."""
+    message_id: Optional[int]
+    payload: Union[InternalResponsePayload, InternalResponseError]
+class InternalResponseCallback(TypedDict, total=False):
+    """Optional ``exception`` / ``err`` / ``result`` slots; ``total=False``, so keys may be omitted."""
+    exception: Optional[Exception]
+    err: Optional[InternalResponseError]
+    result: Optional[InternalResponsePayload]