pagefind 1.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,98 @@
1
+ Metadata-Version: 2.1
2
+ Name: pagefind
3
+ Version: 1.2.0
4
+ Summary: Python API for Pagefind
5
+ License: MIT
6
+ Author: CloudCannon
7
+ Requires-Python: >=3.9
8
+ Classifier: License :: OSI Approved :: MIT License
9
+ Classifier: Programming Language :: Python :: 3
10
+ Classifier: Programming Language :: Python :: 3.9
11
+ Classifier: Programming Language :: Python :: 3.10
12
+ Classifier: Programming Language :: Python :: 3.11
13
+ Classifier: Programming Language :: Python :: 3.12
14
+ Classifier: Programming Language :: Python :: 3.13
15
+ Classifier: Topic :: Text Processing :: Indexing
16
+ Classifier: Topic :: Text Processing :: Markup :: HTML
17
+ Provides-Extra: bin
18
+ Provides-Extra: extended
19
+ Requires-Dist: pagefind_bin (>=1.2.0,<1.3.0) ; extra == "bin"
20
+ Requires-Dist: pagefind_bin_extended (>=1.2.0,<1.3.0) ; extra == "extended"
21
+ Description-Content-Type: text/markdown
22
+
23
+ # `pagefind`
24
+ An async Python API for the [pagefind](https://pagefind.app) binary.
25
+
26
+ ## Installation
27
+
28
+ ```sh
29
+ python3 -m pip install 'pagefind[bin]'
30
+ python3 -m pagefind --help
31
+ ```
32
+
33
+ ## Usage
34
+ <!--[[[cog
35
+ print("```py")
36
+ print(open('./src/tests/integration.py').read())
37
+ print("```")
38
+ ]]] -->
39
+ ```py
40
+ import asyncio
41
+ import json
42
+ import logging
43
+ import os
44
+ from pagefind.index import PagefindIndex, IndexConfig
45
+
46
+ logging.basicConfig(level=os.environ.get("LOG_LEVEL", "INFO"))
47
+ log = logging.getLogger(__name__)
48
+ html_content = (
49
+ "<html>"
50
+ " <body>"
51
+ " <main>"
52
+ " <h1>Example HTML</h1>"
53
+ " <p>This is an example HTML page.</p>"
54
+ " </main>"
55
+ " </body>"
56
+ "</html>"
57
+ )
58
+
59
+
60
+ def prefix(pre: str, s: str) -> str:
61
+ return pre + s.replace("\n", f"\n{pre}")
62
+
63
+
64
+ async def main():
65
+ config = IndexConfig(
66
+ root_selector="main", logfile="index.log", output_path="./output", verbose=True
67
+ )
68
+ async with PagefindIndex(config=config) as index:
69
+ log.debug("opened index")
70
+ new_file, new_record, new_dir = await asyncio.gather(
71
+ index.add_html_file(
72
+ content=html_content,
73
+ url="https://example.com",
74
+ source_path="other/example.html",
75
+ ),
76
+ index.add_custom_record(
77
+ url="/elephants/",
78
+ content="Some testing content regarding elephants",
79
+ language="en",
80
+ meta={"title": "Elephants"},
81
+ ),
82
+ index.add_directory("./public"),
83
+ )
84
+ print(prefix("new_file ", json.dumps(new_file, indent=2)))
85
+ print(prefix("new_record ", json.dumps(new_record, indent=2)))
86
+ print(prefix("new_dir ", json.dumps(new_dir, indent=2)))
87
+
88
+ files = await index.get_files()
89
+ for file in files:
90
+ print(prefix("files", f"{len(file['content']):10}B {file['path']}"))
91
+
92
+
93
+ if __name__ == "__main__":
94
+ asyncio.run(main())
95
+
96
+ ```
97
+ <!-- [[[end]]] -->
98
+
@@ -0,0 +1,75 @@
1
+ # `pagefind`
2
+ An async Python API for the [pagefind](https://pagefind.app) binary.
3
+
4
+ ## Installation
5
+
6
+ ```sh
7
+ python3 -m pip install 'pagefind[bin]'
8
+ python3 -m pagefind --help
9
+ ```
10
+
11
+ ## Usage
12
+ <!--[[[cog
13
+ print("```py")
14
+ print(open('./src/tests/integration.py').read())
15
+ print("```")
16
+ ]]] -->
17
+ ```py
18
+ import asyncio
19
+ import json
20
+ import logging
21
+ import os
22
+ from pagefind.index import PagefindIndex, IndexConfig
23
+
24
+ logging.basicConfig(level=os.environ.get("LOG_LEVEL", "INFO"))
25
+ log = logging.getLogger(__name__)
26
+ html_content = (
27
+ "<html>"
28
+ " <body>"
29
+ " <main>"
30
+ " <h1>Example HTML</h1>"
31
+ " <p>This is an example HTML page.</p>"
32
+ " </main>"
33
+ " </body>"
34
+ "</html>"
35
+ )
36
+
37
+
38
+ def prefix(pre: str, s: str) -> str:
39
+ return pre + s.replace("\n", f"\n{pre}")
40
+
41
+
42
+ async def main():
43
+ config = IndexConfig(
44
+ root_selector="main", logfile="index.log", output_path="./output", verbose=True
45
+ )
46
+ async with PagefindIndex(config=config) as index:
47
+ log.debug("opened index")
48
+ new_file, new_record, new_dir = await asyncio.gather(
49
+ index.add_html_file(
50
+ content=html_content,
51
+ url="https://example.com",
52
+ source_path="other/example.html",
53
+ ),
54
+ index.add_custom_record(
55
+ url="/elephants/",
56
+ content="Some testing content regarding elephants",
57
+ language="en",
58
+ meta={"title": "Elephants"},
59
+ ),
60
+ index.add_directory("./public"),
61
+ )
62
+ print(prefix("new_file ", json.dumps(new_file, indent=2)))
63
+ print(prefix("new_record ", json.dumps(new_record, indent=2)))
64
+ print(prefix("new_dir ", json.dumps(new_dir, indent=2)))
65
+
66
+ files = await index.get_files()
67
+ for file in files:
68
+ print(prefix("files", f"{len(file['content']):10}B {file['path']}"))
69
+
70
+
71
+ if __name__ == "__main__":
72
+ asyncio.run(main())
73
+
74
+ ```
75
+ <!-- [[[end]]] -->
@@ -0,0 +1,67 @@
1
+
2
+ [tool.poetry]
3
+ name = "pagefind"
4
+ version = "1.2.0"
5
+ # note this^^^^^^^ version will be replaced by scripts/build/api_package.py
6
+ description = "Python API for Pagefind"
7
+ authors = ["CloudCannon"]
8
+ license = "MIT"
9
+ readme = "README.md"
10
+ include = []
11
+ exclude = [
12
+ "dist",
13
+ "output",
14
+ "*.whl",
15
+ "*.egg-info",
16
+ "*.log",
17
+ ".venv",
18
+ "pagefind_python_bin" # poetry has a *.pth file in its .venv that causes
19
+ # directories in src/ to be preferentially imported. To allow testing
20
+ # `import pagefind_bin`, we use ./src/pagefind_python_bin as a workaround.
21
+ ]
22
+ classifiers = [
23
+ "License :: OSI Approved :: MIT License",
24
+ "Topic :: Text Processing :: Indexing",
25
+ "Topic :: Text Processing :: Markup :: HTML",
26
+ ]
27
+
28
+ # Note: we *aren't* including an `entry-points` section here to avoid clobbering
29
+ # the user's natively-installed `pagefind` binary. Using `python3 -m pagefind`
30
+ # is an informatively-namespaced alternative that doesn't add too many keystrokes.
31
+ # See https://packaging.python.org/en/latest/specifications/entry-points/
32
+
33
+ [tool.poetry.dependencies]
34
+ python = ">=3.9"
35
+
36
+ # during the building of the `pagefind` python package, the pagefind binary packages
37
+ # aren't yet published. Thus, `poetry lock` will fail if we include them here.
38
+ # However, `poetry build` fails to include the binary package extras in
39
+ # `pagefind`'s distribution info if these lines are commented out. Thus,
40
+ # we temporarily uncomment these lines during the build process, and then re-comment
41
+ # them afterwards
42
+
43
+ # these next two lines are owned by ./scripts/build/api_package.py
44
+ pagefind_bin = { version = "~=1.2.0", optional = true }
45
+ pagefind_bin_extended = { version = "~=1.2.0", optional = true }
46
+
47
+ [tool.poetry.extras]
48
+ bin = ["pagefind_bin"]
49
+ extended = ["pagefind_bin_extended"]
50
+
51
+ [tool.poetry.group.dev.dependencies]
52
+ ruff = "^0.5.0"
53
+ mypy = "^1.10.1"
54
+ wheel = "^0.43.0"
55
+ cogapp = "^3.4.1"
56
+ twine = "^5.1.1"
57
+ docutils = "^0.21.2"
58
+
59
+ [build-system]
60
+ requires = ["poetry-core"]
61
+ build-backend = "poetry.core.masonry.api"
62
+ # note that poetry can currently only build `purelib`s, or pure-python wheels.
63
+ # (see https://python-poetry.org/docs/cli#build)
64
+ # This means poetry can't handle building wheels that contain pagefind's binaries,
65
+ # which are necessarily platform-dependent.
66
+ # For more information on purelibs/pure-python wheels, see
67
+ # https://peps.python.org/pep-0427/#what-s-the-deal-with-purelib-vs-platlib
@@ -0,0 +1,18 @@
1
+ #!/usr/bin/env python3
2
+ # assume the python version is >= 3.9, which is the oldest LTS version with
3
+ # more than 2 months of life as of the time of writing, 2024-08-18
4
+
5
+
6
+ # https://docs.python.org/3/reference/datamodel.html#async-context-managers
7
+ # https://docs.python.org/3/library/contextlib.html#contextlib.asynccontextmanager
8
+
9
+ # [[[cog
10
+ # import tomllib # ok since the development environment must be python >= 3.11
11
+ # from pathlib import Path
12
+ # pyproject = Path("pyproject.toml") # note the CWD is the project root
13
+ # assert pyproject.is_file(), f"expected {pyproject.absolute()} to be a file"
14
+ # version = tomllib.load(pyproject.open("rb"))["tool"]["poetry"]["version"]
15
+ # print(f'__version__ = "{version}"')
16
+ # ]]]
17
+ __version__ = "0.0.0a0"
18
+ # [[[end]]]
@@ -0,0 +1,13 @@
1
import os
import sys

from .service import _must_get_executable

# `python3 -m pagefind ...` forwards every CLI argument, untouched, to the
# resolved pagefind binary.
executable = str(_must_get_executable().resolve().absolute())
command = [executable] + sys.argv[1:]

if os.name != "posix":
    # Elsewhere, run the binary as a child process and mirror its exit status.
    import subprocess

    sys.exit(subprocess.call(command))
else:
    # On POSIX, replace the current Python process with the binary.
    os.execv(executable, command)
@@ -0,0 +1,274 @@
1
+ import logging
2
+ import base64
3
+ from typing import Any, Dict, List, Optional, Sequence, TypedDict, cast
4
+
5
+ from ..service import PagefindService
6
+ from ..service.types import (
7
+ InternalAddDirRequest,
8
+ InternalAddFileRequest,
9
+ InternalAddRecordRequest,
10
+ InternalDeleteIndexRequest,
11
+ InternalGetFilesRequest,
12
+ InternalGetFilesResponse,
13
+ InternalIndexedDirResponse,
14
+ InternalIndexedFileResponse,
15
+ InternalDecodedFile,
16
+ InternalWriteFilesRequest,
17
+ )
18
+
19
+ log = logging.getLogger(__name__)
20
+
21
+
22
class IndexConfig(TypedDict, total=False):
    """Configuration dictionary accepted by ``PagefindIndex``.

    Every key is optional (``total=False``). Each key is described by the
    string that follows it.
    """

    root_selector: Optional[str]
    """
    The root selector to use for the index.
    If not supplied, Pagefind will use the ``<html>`` tag.
    """
    exclude_selectors: Optional[Sequence[str]]
    """Extra element selectors that Pagefind should ignore when indexing."""
    force_language: Optional[str]
    """
    Ignores any detected languages and creates a single index for the entire site as the
    provided language. Expects an ISO 639-1 code, such as ``en`` or ``pt``.
    """
    verbose: Optional[bool]
    """
    Prints extra logging while indexing the site. Only affects the CLI, does not impact
    web-facing search.
    """
    logfile: Optional[str]
    """
    A path to a file to log indexing output to in addition to stdout.
    The file will be created if it doesn't exist and overwritten on each run.
    """
    keep_index_url: Optional[bool]
    """Whether to keep ``index.html`` at the end of search result paths.

    By default, a file at ``animals/cat/index.html`` will be given the URL
    ``/animals/cat/``. Setting this option to ``true`` will result in the URL
    ``/animals/cat/index.html``.
    """
    output_path: Optional[str]
    """
    The folder to output the search bundle into, relative to the processed site.
    Defaults to ``pagefind``.
    """
57
+
58
+
59
class PagefindIndex:
    """Manages a Pagefind index.

    ``PagefindIndex`` operates as an async context manager.
    Entering the context starts a backing Pagefind service and creates an
    in-memory index in that service. Exiting the context writes the in-memory
    index to disk and then shuts down the backing Pagefind service.

    Each method of ``PagefindIndex`` that talks to the backing Pagefind service
    can raise errors. If an exception is raised inside ``PagefindIndex``'s
    context, the context closes without writing the index files to disk; the
    backing service is still shut down.

    ``PagefindIndex`` optionally takes a configuration dictionary
    (``IndexConfig``) that applies a subset of the Pagefind CLI configuration.
    See the `Configuring the Pagefind CLI
    <https://pagefind.app/docs/config-options/>`_ documentation for the meaning
    of each option.
    """

    _service: Optional["PagefindService"] = None
    _index_id: Optional[int] = None
    # Note that config should be immutable.
    _config: Optional[IndexConfig] = None
    # The service launched by ``_start()``. It is tracked separately from
    # ``_service`` so that ``__aexit__`` can still shut it down after
    # ``delete_index()`` has cleared ``_service``.
    _owned_service: Optional["PagefindService"] = None

    def __init__(
        self,
        config: Optional[IndexConfig] = None,
        *,
        _service: Optional["PagefindService"] = None,
        _index_id: Optional[int] = None,
    ):
        """Create an index handle.

        :param config: optional index configuration.
        :param _service: internal; an already-running service to attach to.
        :param _index_id: internal; the id of an index that already exists in
            ``_service``.
        """
        self._service = _service
        self._index_id = _index_id
        self._config = config
        self._owned_service = None

    async def _start(self) -> "PagefindIndex":
        """Start the backing Pagefind service and create an in-memory index."""
        assert self._index_id is None
        assert self._service is None
        service = await PagefindService().launch()
        try:
            created = await service.create_index(self._config)
        except BaseException:
            # ``__aexit__`` never runs when ``__aenter__`` fails, so the freshly
            # launched backend has to be shut down here.
            await service.close()
            raise
        self._service = service
        self._owned_service = service
        self._index_id = created._index_id
        return self

    async def add_html_file(
        self,
        *,
        content: str,
        source_path: Optional[str] = None,
        url: Optional[str] = None,
    ) -> InternalIndexedFileResponse:
        """Add an HTML file to the index.

        :param content: The source HTML content of the file to be parsed.
        :param source_path: The source path the HTML file would have on disk.
            Must be a relative path, or an absolute path within the current
            working directory. Pagefind will compute the result URL from this
            path.
        :param url: an explicit URL to use, instead of having Pagefind compute
            the URL based on the source_path. If not supplied, source_path must
            be supplied.
        :return: the service's ``IndexedFile`` response.
        """
        assert self._service is not None
        assert self._index_id is not None
        result = await self._service.send(
            InternalAddFileRequest(
                type="AddFile",
                index_id=self._index_id,
                url=url,
                file_contents=content,
                file_path=source_path,
            )
        )
        assert result["type"] == "IndexedFile"
        return cast(InternalIndexedFileResponse, result)

    async def add_directory(
        self, path: str, *, glob: Optional[str] = None
    ) -> InternalIndexedDirResponse:
        """Index a directory from disk using the standard Pagefind indexing behaviour.

        This is equivalent to running the Pagefind binary with ``--site <dir>``.

        :param path: the path to the directory to index. If the ``path``
            provided is relative, it will be relative to the current working
            directory of your Python process.
        :param glob: a glob pattern to filter files in the directory. If not
            provided, all files matching ``**.{html}`` are indexed. For more
            information on glob patterns, see the `Wax patterns documentation
            <https://github.com/olson-sean-k/wax#patterns>`_.
        :return: the service's ``IndexedDir`` response.
        """
        assert self._service is not None
        assert self._index_id is not None
        result = await self._service.send(
            InternalAddDirRequest(
                type="AddDir",
                index_id=self._index_id,
                path=path,
                glob=glob,
            )
        )
        assert result["type"] == "IndexedDir"
        return cast(InternalIndexedDirResponse, result)

    async def get_files(self) -> List[InternalDecodedFile]:
        """Get raw data of all files in the Pagefind index.

        WATCH OUT: this method emits all files. This can be a lot of data, and
        this amount of data can cause reading from the subprocess pipes to deadlock.

        STRICTLY PREFER calling ``self.write_files()``.

        :return: one entry per file, with ``content`` base64-decoded to bytes.
        """
        assert self._service is not None
        assert self._index_id is not None

        response = await self._service.send(
            InternalGetFilesRequest(type="GetFiles", index_id=self._index_id)
        )
        assert response["type"] == "GetFiles"
        files = cast(InternalGetFilesResponse, response)["files"]

        # The service transports file contents as base64 text.
        decoded_files = [
            {"path": file["path"], "content": base64.b64decode(file["content"])}
            for file in files
        ]

        return cast(List[InternalDecodedFile], decoded_files)

    async def delete_index(self) -> None:
        """
        Deletes the data for the given index from its backing Pagefind service.
        Doesn't affect any written files or data returned by ``get_files()``.

        Afterwards this object is detached from the service: further calls that
        need the index fail their precondition asserts.
        """
        assert self._service is not None
        assert self._index_id is not None
        result = await self._service.send(
            InternalDeleteIndexRequest(type="DeleteIndex", index_id=self._index_id)
        )
        assert result["type"] == "DeleteIndex"
        self._index_id = None
        self._service = None

    async def add_custom_record(
        self,
        *,
        url: str,
        content: str,
        language: str,
        meta: Optional[Dict[str, str]] = None,
        filters: Optional[Dict[str, List[str]]] = None,
        sort: Optional[Dict[str, str]] = None,
    ) -> InternalIndexedFileResponse:
        """Add a direct record to the Pagefind index.

        This method is useful for adding non-HTML content to the search results.

        :param content: the raw content of this record.
        :param url: the output URL of this record. Pagefind will not alter this.
        :param language: ISO 639-1 code of the language this record is written in.
        :param meta: the metadata to attach to this record. Supplying a ``title`` is highly recommended.
        :param filters: the filters to attach to this record. Filters are used to group records together.
        :param sort: the sort keys to attach to this record.
        :return: the service's ``IndexedFile`` response.
        """
        assert self._service is not None
        assert self._index_id is not None
        result = await self._service.send(
            InternalAddRecordRequest(
                type="AddRecord",
                index_id=self._index_id,
                url=url,
                content=content,
                language=language,
                meta=meta,
                filters=filters,
                sort=sort,
            )
        )
        assert result["type"] == "IndexedFile"
        return cast(InternalIndexedFileResponse, result)

    async def write_files(self, output_path: Optional[str] = None) -> None:
        """Write the index files to disk.

        If you're using PagefindIndex as a context manager, there's no need to call this method:
        if no error occurred, closing the context automatically writes the index files to disk.

        :param output_path: a path to override the configured output path for the index.
        """
        assert self._service is not None
        assert self._index_id is not None
        if not output_path:
            # Fall back to the configured output path, if there is one.
            output_path = self._config.get("output_path") if self._config else None

        result = await self._service.send(
            InternalWriteFilesRequest(
                type="WriteFiles",
                index_id=self._index_id,
                output_path=output_path,
            )
        )
        assert result["type"] == "WriteFiles"

    async def __aenter__(self) -> "PagefindIndex":
        """Start the backing service and create the in-memory index."""
        assert self._service is None
        assert self._index_id is None
        return await self._start()

    async def __aexit__(
        self,
        exc_type: Optional[Any],
        exc_value: Optional[Any],
        traceback: Optional[Any],
    ) -> None:
        """Write the index to disk (only on a clean exit) and stop the service.

        The service is closed even when writing fails or when
        ``delete_index()`` was called inside the context; otherwise the backend
        process would be leaked.
        """
        service = (
            self._owned_service if self._owned_service is not None else self._service
        )
        self._owned_service = None
        if service is None:
            return
        try:
            has_index = self._service is not None and self._index_id is not None
            if has_index and exc_type is None:
                await self.write_files()
        finally:
            await service.close()
File without changes
@@ -0,0 +1,236 @@
1
+ import asyncio
2
+ import base64
3
+ import json
4
+ import logging
5
+ import os
6
+ import shutil
7
+ from contextlib import AbstractAsyncContextManager
8
+ from pathlib import Path
9
+ from typing import TYPE_CHECKING, Any, Dict, List, Optional, cast
10
+
11
+ from .types import (
12
+ InternalNewIndexRequest,
13
+ InternalNewIndexResponse,
14
+ InternalRequestPayload,
15
+ InternalResponseError,
16
+ InternalResponsePayload,
17
+ InternalResponseType,
18
+ InternalServiceRequest,
19
+ InternalServiceResponse,
20
+ InternalSyntheticFile,
21
+ )
22
+
23
+ if TYPE_CHECKING:
24
+ from ..index import IndexConfig, PagefindIndex
25
+
26
+ log = logging.getLogger(__name__)
27
+
28
+
29
+ __all__ = ["PagefindService", "get_executable"]
30
+
31
+
32
def get_executable() -> Optional[Path]:
    """Locate a pagefind binary, or return ``None`` if none can be found.

    Candidates are tried in this order:

    1. the ``PAGEFIND_BINARY_PATH`` environment variable (used as-is),
    2. the ``pagefind_bin_extended`` package,
    3. the ``pagefind_bin`` package,
    4. ``pagefind_extended`` and then ``pagefind`` on ``PATH``.
    """
    override = os.getenv("PAGEFIND_BINARY_PATH")
    if override is not None:
        log.debug(f"using {override}")
        return Path(override)

    try:
        from pagefind_bin_extended import (  # type: ignore
            get_executable as locate_extended,
        )

        extended_path: Path = locate_extended()
        log.debug(f"using {extended_path}")
        return extended_path
    except ImportError:
        log.debug("unable to import pagefind_bin_extended")

    try:
        from pagefind_bin import get_executable as locate_standard  # type: ignore

        standard_path: Path = locate_standard()
        log.debug(f"using {standard_path}")
        return standard_path
    except ImportError:
        log.debug("unable to import pagefind_bin")

    # Last resort: whatever is installed on PATH, preferring the extended build.
    on_path: Optional[str] = shutil.which("pagefind_extended") or shutil.which(
        "pagefind"
    )
    if on_path is not None:
        log.debug(f"using {on_path}")
        return Path(on_path)

    log.debug("Could not find externally-installed pagefind binary")
    return None
64
+
65
+
66
def _must_get_executable() -> Path:
    """Return the pagefind binary path, raising if it cannot be located.

    :raises FileNotFoundError: when ``get_executable()`` finds nothing.
    """
    located = get_executable()
    if located is not None:
        return located
    raise FileNotFoundError("Could not find pagefind binary")
70
+
71
+
72
def _encode(req: InternalServiceRequest) -> bytes:
    """Serialize a request to JSON and return it base64-encoded as bytes."""
    as_json = json.dumps(req)
    raw = as_json.encode("utf-8")
    return base64.b64encode(raw)
74
+
75
+
76
class PagefindService(AbstractAsyncContextManager["PagefindService"]):
    """Async client for a pagefind binary running in ``--service`` mode.

    Requests are written to the subprocess's stdin as base64-encoded JSON
    followed by a comma. Responses are read from its stdout up to the next
    comma and decoded the same way. Each request carries a ``message_id``
    that is used to match the response to the future awaited by ``send()``.
    """

    _bin: Path
    _backend: asyncio.subprocess.Process
    _message_id: int = 0
    # Futures of requests that are still in flight, keyed by message id.
    _responses: Dict[int, asyncio.Future[InternalResponsePayload]]
    _loop: asyncio.AbstractEventLoop
    _poll_task: asyncio.Task[None]

    def __init__(self) -> None:
        """Resolve the pagefind binary; the subprocess itself is started by ``launch()``.

        :raises FileNotFoundError: if no pagefind binary can be located.
        """
        self._loop = asyncio.get_event_loop()
        self._bin = _must_get_executable()
        self._responses = dict()

    async def launch(self) -> "PagefindService":
        """Start the backend subprocess and the task that reads its responses."""
        log.debug(f"launching {self._bin}")
        # TODO: detach process on windows?
        self._backend = await asyncio.create_subprocess_exec(
            self._bin,
            "--service",
            # "--verbose" is deliberately not passed: it emits debug logs to
            # stdout, which is also used for IPC
            cwd=os.getcwd(),
            stdin=asyncio.subprocess.PIPE,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.STDOUT,
            limit=2**21,  # <- 2MiB
            # anything less and the _wait_for_responses loop will hang
            # due to the stdout pipes deadlocking due to the buffer filling up
        )
        log.debug(f"launched {self._bin}: {self._backend}.")
        log.debug("polling for responses")
        self._poll_task = self._loop.create_task(self._wait_for_responses())
        log.debug(f"polling task created: {self._poll_task}")
        return self

    async def send(self, payload: InternalRequestPayload) -> InternalResponsePayload:
        """Send one request to the backend and wait for its response.

        :param payload: the request payload to send.
        :return: the response payload reported by the backend.
        :raises KeyError: if the next message id is unexpectedly still in use.
        :raises Exception: if the backend answers with an ``Error`` payload, or
            if the backend's output stream closes before a response arrives.
        """
        self._message_id += 1
        message_id = self._message_id
        if message_id in self._responses:
            raise KeyError(f"message_id {message_id} already in use")
        future: asyncio.Future[InternalResponsePayload] = self._loop.create_future()
        self._responses[message_id] = future
        try:
            if self._backend.stdin is None:
                # restart the backend
                log.debug("restarting backend")
                await self.launch()
                log.debug("backend restarted")
            assert self._backend.stdin is not None
            req = InternalServiceRequest(message_id=message_id, payload=payload)
            log.debug(f"sending request: {req}")
            self._backend.stdin.write(_encode(req) + b",")
            # backend waits for a comma before responding
            await self._backend.stdin.drain()
            log.debug(f"request sent: {req}")
            result = await future
        finally:
            # Drop the bookkeeping entry once the request is finished (or was
            # cancelled). Otherwise every response, including huge GetFiles
            # payloads, stays referenced for the lifetime of the service.
            self._responses.pop(message_id, None)
        if result["type"] == InternalResponseType.GET_FILES.value:  # these are HUGE
            if (files := result.get("files")) is not None:
                files = cast(List[InternalSyntheticFile], files)
                base64_ch = sum(len(file["content"]) for file in files)
                log.debug(f"received response: <{len(files)} files, {base64_ch} chars>")
        else:
            log.debug(f"received response: {result}")
        return result

    def _fail_pending(self, error: BaseException) -> None:
        """Reject every still-pending request future with ``error``."""
        for pending in list(self._responses.values()):
            if not pending.done():
                pending.set_exception(error)

    async def _wait_for_responses(self) -> None:
        """
        Poll the subprocess's stdout for responses and resolve the matching futures.
        """
        while True:
            await asyncio.sleep(0.1)
            assert self._backend.stdout is not None
            log.debug("checking for data")
            try:
                output = await self._backend.stdout.readuntil(b",")
            except asyncio.IncompleteReadError:
                # EOF: the backend closed its stdout. No further response can
                # arrive, so fail in-flight requests instead of leaving their
                # senders waiting.
                log.error("pagefind backend closed its output stream")
                self._fail_pending(
                    ConnectionError("pagefind backend closed its output stream")
                )
                return
            if len(output) <= 200:
                log.debug(f"received data: {output!r}")
            else:
                log.debug(
                    f"received data: {output[:30]!r}...{len(output) - 40}B...{output[-10:]!r}"
                )
            # output[:-1] strips the trailing comma delimiter
            if (resp := json.loads(base64.b64decode(output[:-1]))) is None:
                continue
            resp = cast(InternalServiceResponse, resp)
            message_id = resp.get("message_id")
            if message_id is None:
                # If the backend service failed to parse the message, it won't return the ID
                # However it does return the message itself, so we can retrieve the ID we sent
                if (original := resp["payload"].get("original_message")) is not None:
                    if (sent := json.loads(original)) is not None:
                        message_id = sent.get("message_id")
            if message_id is None:
                continue
            log.debug(f"received response for message {message_id}")
            assert (
                self._message_id >= message_id
            ), f"message_id out of order: incoming {message_id} > current: {self._message_id}"
            future = self._responses.get(message_id)
            if future is None:
                log.debug(f"no receiving future for message {message_id}")
                # FIXME: figure out how to surface the error
                continue
            if future.done():
                # e.g. the sender was cancelled; resolving the future again
                # would raise InvalidStateError and end this loop
                log.debug(f"future for message {message_id} is already done")
                continue
            log.debug(f"resolving future for message {message_id}")
            payload = resp["payload"]
            if payload["type"] == InternalResponseType.ERROR.value:
                exc = cast(InternalResponseError, payload)
                future.set_exception(
                    Exception(exc["message"], exc.get("original_message"))
                )
            else:
                future.set_result(cast(InternalResponsePayload, payload))

    async def close(self) -> None:
        """Wait briefly for in-flight requests, then stop the backend process."""
        # wait for all _responses to be resolved
        log.debug("waiting for all responses to be resolved")
        try:
            # wait at most 5s for all responses to be resolved.
            # asyncio.wait_for is used because asyncio.timeout only exists on
            # Python >= 3.11. return_exceptions=True keeps a failed request,
            # already reported to its sender, from aborting the shutdown.
            await asyncio.wait_for(
                asyncio.gather(*self._responses.values(), return_exceptions=True),
                timeout=5,
            )
            log.debug("all responses resolved")
        except asyncio.TimeoutError:
            log.error("timed out waiting for responses to be resolved")
        finally:
            self._poll_task.cancel()
            if self._backend.returncode is None:
                # only signal a process that is still running
                self._backend.terminate()
            await self._backend.wait()
            log.debug("backend terminated")

    async def __aenter__(self) -> "PagefindService":
        """Launch the backend when entering the context."""
        return await self.launch()

    async def __aexit__(
        self,
        exc_type: Optional[Any],
        exc_value: Optional[Any],
        traceback: Optional[Any],
    ) -> None:
        """Shut the backend down when leaving the context."""
        await self.close()

    async def create_index(
        self, config: Optional["IndexConfig"] = None
    ) -> "PagefindIndex":
        """Create a new in-memory index in the backend.

        :param config: optional index configuration. ``output_path`` is not
            sent to the backend with the ``NewIndex`` request, but the full
            config is kept on the returned ``PagefindIndex``.
        :return: a ``PagefindIndex`` bound to this service and the new index id.
        """
        from ..index import PagefindIndex

        _config: Optional["IndexConfig"] = None
        if config is not None:
            _config = {**config}  # clone the config to avoid modifying the original
            _config.pop("output_path", None)

        log.debug(f"creating index with config: {_config}")
        result = await self.send(
            InternalNewIndexRequest(type="NewIndex", config=_config)
        )
        log.debug(f"received response: {result}")
        assert result["type"] == "NewIndex"
        result = cast(InternalNewIndexResponse, result)
        return PagefindIndex(config=config, _service=self, _index_id=result["index_id"])
@@ -0,0 +1,168 @@
1
+ from enum import Enum
2
+ from typing import Dict, List, Literal, Optional, Sequence, TypedDict, Union
3
+
4
+
5
class InternalRequestType(Enum):
    """Values of the ``type`` field used by the request payloads in this module."""

    NEW_INDEX = "NewIndex"
    ADD_FILE = "AddFile"
    ADD_RECORD = "AddRecord"
    ADD_DIR = "AddDir"
    WRITE_FILES = "WriteFiles"
    GET_FILES = "GetFiles"
    DELETE_INDEX = "DeleteIndex"
13
+
14
+
15
class InternalPagefindServiceConfig(TypedDict, total=False):
    """Index configuration carried by ``InternalNewIndexRequest.config``.

    Every key is optional (``total=False``).
    """

    # FIXME: document
    # NOTE(review): the keys presumably carry the same meaning as the
    # same-named Pagefind CLI options; confirm before documenting per key.
    root_selector: Optional[str]
    exclude_selectors: Optional[Sequence[str]]
    force_language: Optional[str]
    verbose: Optional[bool]
    logfile: Optional[str]
    keep_index_url: Optional[bool]
23
+
24
+
25
class InternalNewIndexRequest(TypedDict):
    """Request payload of type ``NewIndex``, with an optional configuration."""

    type: Literal["NewIndex"]
    config: Optional[InternalPagefindServiceConfig]
28
+
29
+
30
class InternalAddFileRequest(TypedDict):
    """Request payload of type ``AddFile``: file contents plus an optional path and URL."""

    type: Literal["AddFile"]
    index_id: int
    """index_id must be positive."""
    file_path: Optional[str]
    url: Optional[str]
    file_contents: str
37
+
38
+
39
class InternalAddRecordRequest(TypedDict):
    """Request payload of type ``AddRecord``: a record's URL, content and language, plus optional meta, filters and sort keys."""

    type: Literal["AddRecord"]
    index_id: int
    """index_id must be positive."""
    url: str
    content: str
    language: str
    meta: Optional[Dict[str, str]]
    filters: Optional[Dict[str, List[str]]]
    sort: Optional[Dict[str, str]]
49
+
50
+
51
class InternalAddDirRequest(TypedDict, total=False):
    """Request payload of type ``AddDir``: a directory path and an optional glob.

    Declared with ``total=False``, so a type checker treats every key as optional.
    """

    type: Literal["AddDir"]
    index_id: int
    path: str  # TODO: support Path
    glob: Optional[str]
56
+
57
+
58
class InternalWriteFilesRequest(TypedDict, total=False):
    """Request payload of type ``WriteFiles``, with an optional output path.

    Declared with ``total=False``, so a type checker treats every key as optional.
    """

    type: Literal["WriteFiles"]
    index_id: int
    """index_id must be positive."""
    output_path: Optional[str]
63
+
64
+
65
class InternalGetFilesRequest(TypedDict):
    """Request payload of type ``GetFiles`` for one index."""

    type: Literal["GetFiles"]
    index_id: int
    """index_id must be positive."""
69
+
70
+
71
class InternalDeleteIndexRequest(TypedDict):
    """Request payload of type ``DeleteIndex`` for one index."""

    type: Literal["DeleteIndex"]
    index_id: int
    """index_id must be positive."""
75
+
76
+
77
# Union of every request payload type declared in this module.
InternalRequestPayload = Union[
    InternalNewIndexRequest,
    InternalAddFileRequest,
    InternalAddRecordRequest,
    InternalAddDirRequest,
    InternalWriteFilesRequest,
    InternalGetFilesRequest,
    InternalDeleteIndexRequest,
]
86
+
87
+
88
class InternalServiceRequest(TypedDict):
    """Envelope pairing a request payload with an optional ``message_id``."""

    message_id: Optional[int]
    payload: InternalRequestPayload
91
+
92
+
93
class InternalResponseType(Enum):
    """Values of the ``type`` field used by the response payloads in this module."""

    NEW_INDEX = "NewIndex"
    INDEXED_FILE = "IndexedFile"
    INDEXED_DIR = "IndexedDir"
    WRITE_FILES = "WriteFiles"
    GET_FILES = "GetFiles"
    DELETE_INDEX = "DeleteIndex"
    ERROR = "Error"
101
+
102
+
103
class InternalResponseError(TypedDict):
    """Response payload of type ``Error``: a message and, optionally, the original message."""

    type: Literal["Error"]
    message: str
    original_message: Optional[str]
107
+
108
+
109
class InternalNewIndexResponse(TypedDict):
    """Response payload of type ``NewIndex``, carrying an ``index_id``."""

    type: Literal["NewIndex"]
    index_id: int
112
+
113
+
114
class InternalIndexedFileResponse(TypedDict):
    """Response payload of type ``IndexedFile``: a page's word count, URL and metadata."""

    type: Literal["IndexedFile"]
    page_word_count: int
    page_url: str
    page_meta: Dict[str, str]
119
+
120
+
121
class InternalIndexedDirResponse(TypedDict):
    """Response payload of type ``IndexedDir``, carrying a page count."""

    # Narrowed from ``str`` so this payload is discriminated by a literal tag
    # like every other response type in this module.
    type: Literal["IndexedDir"]
    page_count: int
124
+
125
+
126
class InternalWriteFilesResponse(TypedDict):
    """Response payload of type ``WriteFiles``, carrying an output path."""

    # The tag was previously declared as "IndexedFile" (a copy-paste slip);
    # the ``WriteFiles`` member of ``InternalResponseType`` is the matching tag.
    type: Literal["WriteFiles"]
    output_path: str
129
+
130
+
131
class InternalSyntheticFile(TypedDict):
    """A file entry of a ``GetFiles`` response: a path plus its content as a string."""

    path: str
    content: str
134
+
135
+
136
class InternalDecodedFile(TypedDict):
    """A file entry whose ``content`` is bytes rather than a string."""

    path: str
    content: bytes
139
+
140
+
141
class InternalGetFilesResponse(TypedDict):
    """Response payload of type ``GetFiles``, carrying a list of files."""

    type: Literal["GetFiles"]
    files: List[InternalSyntheticFile]
144
+
145
+
146
class InternalDeleteIndexResponse(TypedDict):
    """Response payload of type ``DeleteIndex``; it carries no other fields."""

    type: Literal["DeleteIndex"]
148
+
149
+
150
# Union of the non-error response payload types declared in this module.
# ``InternalResponseError`` is not a member of this union.
InternalResponsePayload = Union[
    InternalNewIndexResponse,
    InternalIndexedFileResponse,
    InternalIndexedDirResponse,
    InternalWriteFilesResponse,
    InternalGetFilesResponse,
    InternalDeleteIndexResponse,
]
158
+
159
+
160
class InternalServiceResponse(TypedDict):
    """Envelope pairing a response (or error) payload with an optional ``message_id``."""

    message_id: Optional[int]
    payload: Union[InternalResponsePayload, InternalResponseError]
163
+
164
+
165
class InternalResponseCallback(TypedDict, total=False):
    """Optional exception, error payload and result slots for a response.

    NOTE(review): not referenced anywhere in the visible source; confirm it is
    still needed.
    """

    exception: Optional[Exception]
    err: Optional[InternalResponseError]
    result: Optional[InternalResponsePayload]