pageserve 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,221 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[codz]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py.cover
50
+ *.lcov
51
+ .hypothesis/
52
+ .pytest_cache/
53
+ cover/
54
+
55
+ # Translations
56
+ *.mo
57
+ *.pot
58
+
59
+ # Django stuff:
60
+ *.log
61
+ local_settings.py
62
+ db.sqlite3
63
+ db.sqlite3-journal
64
+
65
+ # Flask stuff:
66
+ instance/
67
+ .webassets-cache
68
+
69
+ # Scrapy stuff:
70
+ .scrapy
71
+
72
+ # Sphinx documentation
73
+ docs/_build/
74
+ docs_internal
75
+
76
+ # PyBuilder
77
+ .pybuilder/
78
+ target/
79
+
80
+ # Jupyter Notebook
81
+ .ipynb_checkpoints
82
+
83
+ # IPython
84
+ profile_default/
85
+ ipython_config.py
86
+
87
+ # pyenv
88
+ # For a library or package, you might want to ignore these files since the code is
89
+ # intended to run in multiple environments; otherwise, check them in:
90
+ # .python-version
91
+
92
+ # pipenv
93
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
94
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
95
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
96
+ # install all needed dependencies.
97
+ # Pipfile.lock
98
+
99
+ # UV
100
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
101
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
102
+ # commonly ignored for libraries.
103
+ # uv.lock
104
+
105
+ # poetry
106
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
107
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
108
+ # commonly ignored for libraries.
109
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
110
+ # poetry.lock
111
+ # poetry.toml
112
+
113
+ # pdm
114
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
115
+ # pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
116
+ # https://pdm-project.org/en/latest/usage/project/#working-with-version-control
117
+ # pdm.lock
118
+ # pdm.toml
119
+ .pdm-python
120
+ .pdm-build/
121
+
122
+ # pixi
123
+ # Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
124
+ # pixi.lock
125
+ # Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
126
+ # in the .venv directory. It is recommended not to include this directory in version control.
127
+ .pixi/*
128
+ !.pixi/config.toml
129
+
130
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
131
+ __pypackages__/
132
+
133
+ # Celery stuff
134
+ celerybeat-schedule*
135
+ celerybeat.pid
136
+
137
+ # Redis
138
+ *.rdb
139
+ *.aof
140
+ *.pid
141
+
142
+ # RabbitMQ
143
+ mnesia/
144
+ rabbitmq/
145
+ rabbitmq-data/
146
+
147
+ # ActiveMQ
148
+ activemq-data/
149
+
150
+ # SageMath parsed files
151
+ *.sage.py
152
+
153
+ # Environments
154
+ .env
155
+ .envrc
156
+ .venv
157
+ env/
158
+ venv/
159
+ ENV/
160
+ env.bak/
161
+ venv.bak/
162
+
163
+ # Spyder project settings
164
+ .spyderproject
165
+ .spyproject
166
+
167
+ # Rope project settings
168
+ .ropeproject
169
+
170
+ # mkdocs documentation
171
+ /site
172
+
173
+ # mypy
174
+ .mypy_cache/
175
+ .dmypy.json
176
+ dmypy.json
177
+
178
+ # Pyre type checker
179
+ .pyre/
180
+
181
+ # pytype static type analyzer
182
+ .pytype/
183
+
184
+ # Cython debug symbols
185
+ cython_debug/
186
+
187
+ # PyCharm
188
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
189
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
190
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
191
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
192
+ # .idea/
193
+
194
+ # Abstra
195
+ # Abstra is an AI-powered process automation framework.
196
+ # Ignore directories containing user credentials, local state, and settings.
197
+ # Learn more at https://abstra.io/docs
198
+ .abstra/
199
+
200
+ # Visual Studio Code
201
+ # Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
202
+ # that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
203
+ # and can be added to the global gitignore or merged into this file. However, if you prefer,
204
+ # you could uncomment the following to ignore the entire vscode folder
205
+ # .vscode/
206
+ # Temporary file for partial code execution
207
+ tempCodeRunnerFile.py
208
+
209
+ # Ruff stuff:
210
+ .ruff_cache/
211
+
212
+ # PyPI configuration file
213
+ .pypirc
214
+
215
+ # Marimo
216
+ marimo/_static/
217
+ marimo/_lsp/
218
+ __marimo__/
219
+
220
+ # Streamlit
221
+ .streamlit/secrets.toml
@@ -0,0 +1,208 @@
1
+ Metadata-Version: 2.4
2
+ Name: pageserve
3
+ Version: 0.1.0
4
+ Summary: Python SDK and MCP server for self-hosted PageIndex RAG service
5
+ Project-URL: Homepage, https://github.com/pageserve/pageserve
6
+ Project-URL: Repository, https://github.com/pageserve/pageserve
7
+ Project-URL: Issues, https://github.com/pageserve/pageserve/issues
8
+ Project-URL: Documentation, https://github.com/pageserve/pageserve#readme
9
+ Author-email: pageserve <pageserve03@gmail.com>
10
+ License: Apache-2.0
11
+ Keywords: document-ai,llm,mcp,pageindex,rag,self-host,vector-free
12
+ Classifier: Development Status :: 3 - Alpha
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: License :: OSI Approved :: Apache Software License
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.10
17
+ Classifier: Programming Language :: Python :: 3.11
18
+ Classifier: Programming Language :: Python :: 3.12
19
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
20
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
21
+ Requires-Python: >=3.10
22
+ Requires-Dist: httpx>=0.27.0
23
+ Requires-Dist: pydantic>=2.0
24
+ Provides-Extra: all
25
+ Requires-Dist: click>=8.0; extra == 'all'
26
+ Requires-Dist: mcp>=1.9.4; extra == 'all'
27
+ Provides-Extra: cli
28
+ Requires-Dist: click>=8.0; extra == 'cli'
29
+ Provides-Extra: dev
30
+ Requires-Dist: hatch; extra == 'dev'
31
+ Requires-Dist: mypy; extra == 'dev'
32
+ Requires-Dist: pytest-asyncio>=0.23; extra == 'dev'
33
+ Requires-Dist: pytest-cov; extra == 'dev'
34
+ Requires-Dist: pytest>=8.0; extra == 'dev'
35
+ Requires-Dist: respx>=0.21; extra == 'dev'
36
+ Requires-Dist: ruff; extra == 'dev'
37
+ Provides-Extra: mcp
38
+ Requires-Dist: mcp>=1.9.4; extra == 'mcp'
39
+ Description-Content-Type: text/markdown
40
+
41
+ # pageserve
42
+
43
+ Python SDK and MCP server for the [PageIndex](https://github.com/VectifyAI/PageIndex) self-hosted RAG service.
44
+
45
+ PageIndex is a reasoning-based RAG engine that navigates document structure rather than doing vector similarity search — it reads the table of contents, picks the right sections, and synthesizes answers with page-level citations.
46
+
47
+ ## Installation
48
+
49
+ ```bash
50
+ pip install pageserve # SDK only (sync + async)
51
+ pip install "pageserve[mcp]" # + MCP server for agent frameworks
52
+ pip install "pageserve[cli]" # + CLI commands
53
+ pip install "pageserve[all]" # everything
54
+ ```
55
+
56
+ ## Quick start
57
+
58
+ ```python
59
+ from pageserve import PageServeClient
60
+
61
+ client = PageServeClient(
62
+ base_url = "https://pageindex.company.com",
63
+ public_key = "<your-public-key>",
64
+ secret_key = "<your-secret-key>",
65
+ )
66
+
67
+ # List indexed documents
68
+ docs = client.list_documents()
69
+
70
+ # Ask a question
71
+ result = client.query(docs[0].doc_id, "What are the probation terms?")
72
+ print(result.answer)
73
+ print(result.citation) # "Employment Contract p.5, 6"
74
+ print(result.page_refs) # [5, 6]
75
+
76
+ # Upload and wait for indexing to complete
77
+ upload = client.upload("./contract.pdf", wait=True)
78
+
79
+ # Read specific pages (no LLM, instant)
80
+ pages = client.get_pages(docs[0].doc_id, "22-24")
81
+ ```
82
+
83
+ ## Async usage
84
+
85
+ ```python
86
+ import asyncio
87
+ from pageserve import AsyncPageServeClient
88
+
89
+ async def main():
90
+ async with AsyncPageServeClient(
91
+ base_url = "https://pageindex.company.com",
92
+ public_key = "<your-public-key>",
93
+ secret_key = "<your-secret-key>",
94
+ ) as client:
95
+ docs = await client.list_documents()
96
+
97
+ # Query multiple documents concurrently
98
+ results = await client.query_many([
99
+ (docs[0].doc_id, "What are the probation terms?"),
100
+ (docs[1].doc_id, "What does the law say about probation?"),
101
+ ])
102
+ for r in results:
103
+ print(r.answer)
104
+
105
+ asyncio.run(main())
106
+ ```
107
+
108
+ ## Streaming
109
+
110
+ ```python
111
+ for event in client.query_stream(doc_id, "What are the key clauses?"):
112
+ if event.type == "token":
113
+ print(event.content, end="", flush=True)
114
+ elif event.type == "sources":
115
+ for src in event.sources:
116
+ print(f"\nSource: {src.citation}")
117
+ elif event.type == "done":
118
+ break
119
+ ```
120
+
121
+ ## MCP server (Claude Desktop / Cursor)
122
+
123
+ Run the MCP server so Claude or any MCP-compatible agent can query your documents:
124
+
125
+ ```json
126
+ {
127
+ "mcpServers": {
128
+ "pageindex": {
129
+ "command": "pageserve",
130
+ "args": ["mcp"],
131
+ "env": {
132
+ "PAGESERVE_URL": "https://pageindex.company.com",
133
+ "PAGESERVE_PUBLIC_KEY": "<your-public-key>",
134
+ "PAGESERVE_SECRET_KEY": "<your-secret-key>"
135
+ }
136
+ }
137
+ }
138
+ }
139
+ ```
140
+
141
+ The MCP server exposes five tools:
142
+ - `list_documents` — see what documents are available
143
+ - `query_document` — ask a question against one document
144
+ - `query_multiple_documents` — cross-reference multiple documents
145
+ - `get_page_content` — read raw page text (no LLM, instant)
146
+ - `get_document_structure` — browse the table of contents
147
+
148
+ Keys live in env vars and never appear in tool arguments or the model's context window.
149
+
150
+ ## CLI
151
+
152
+ ```bash
153
+ export PAGESERVE_URL=https://pageindex.company.com
154
+ export PAGESERVE_PUBLIC_KEY="<your-public-key>"
155
+ export PAGESERVE_SECRET_KEY="<your-secret-key>"
156
+
157
+ pageserve list # list documents
158
+ pageserve query <doc_id> "question" # ask a question
159
+ pageserve query <doc_id> "question" --stream # streaming output
160
+ pageserve upload ./report.pdf --watch # upload + progress bar
161
+ pageserve health # service status
162
+ pageserve keys list # list API keys
163
+ pageserve keys create "My App" # create a key
164
+ pageserve mcp # run MCP server (stdio)
165
+ pageserve mcp --transport sse --port 3000 # MCP over SSE
166
+ ```
167
+
168
+ ## Error handling
169
+
170
+ ```python
171
+ from pageserve import (
172
+ AuthError,
173
+ NotFoundError,
174
+ DocumentNotReadyError,
175
+ FileTooLargeError,
176
+ RateLimitError,
177
+ ServiceUnavailableError,
178
+ ServiceError,
179
+ )
180
+
181
+ try:
182
+ result = client.query(doc_id, "question")
183
+ except AuthError:
184
+ print("Invalid or expired API key")
185
+ except NotFoundError:
186
+ print("Document not found")
187
+ except RateLimitError as e:
188
+ import time; time.sleep(e.retry_after)
189
+ except ServiceError as e:
190
+ print(f"Server error [{e.status_code}]")
191
+ ```
192
+
193
+ ## Documentation
194
+
195
+ - [Getting Started](docs/getting-started.md)
196
+ - [Authentication](docs/authentication.md)
197
+ - [Sync Client Reference](docs/sync-client.md)
198
+ - [Async Client Reference](docs/async-client.md)
199
+ - [Streaming (SSE)](docs/streaming.md)
200
+ - [MCP Server](docs/mcp-server.md)
201
+ - [CLI Reference](docs/cli.md)
202
+ - [Data Models](docs/models.md)
203
+ - [Error Handling](docs/error-handling.md)
204
+
205
+ ## Related
206
+
207
+ - [PageIndex OSS](https://github.com/VectifyAI/PageIndex) — the self-hosted service this SDK wraps
208
+ - [PageIndex Cloud](https://pageindex.ai) — hosted version
@@ -0,0 +1,168 @@
1
+ # pageserve
2
+
3
+ Python SDK and MCP server for the [PageIndex](https://github.com/VectifyAI/PageIndex) self-hosted RAG service.
4
+
5
+ PageIndex is a reasoning-based RAG engine that navigates document structure rather than doing vector similarity search — it reads the table of contents, picks the right sections, and synthesizes answers with page-level citations.
6
+
7
+ ## Installation
8
+
9
+ ```bash
10
+ pip install pageserve # SDK only (sync + async)
11
+ pip install "pageserve[mcp]" # + MCP server for agent frameworks
12
+ pip install "pageserve[cli]" # + CLI commands
13
+ pip install "pageserve[all]" # everything
14
+ ```
15
+
16
+ ## Quick start
17
+
18
+ ```python
19
+ from pageserve import PageServeClient
20
+
21
+ client = PageServeClient(
22
+ base_url = "https://pageindex.company.com",
23
+ public_key = "<your-public-key>",
24
+ secret_key = "<your-secret-key>",
25
+ )
26
+
27
+ # List indexed documents
28
+ docs = client.list_documents()
29
+
30
+ # Ask a question
31
+ result = client.query(docs[0].doc_id, "What are the probation terms?")
32
+ print(result.answer)
33
+ print(result.citation) # "Employment Contract p.5, 6"
34
+ print(result.page_refs) # [5, 6]
35
+
36
+ # Upload and wait for indexing to complete
37
+ upload = client.upload("./contract.pdf", wait=True)
38
+
39
+ # Read specific pages (no LLM, instant)
40
+ pages = client.get_pages(docs[0].doc_id, "22-24")
41
+ ```
42
+
43
+ ## Async usage
44
+
45
+ ```python
46
+ import asyncio
47
+ from pageserve import AsyncPageServeClient
48
+
49
+ async def main():
50
+ async with AsyncPageServeClient(
51
+ base_url = "https://pageindex.company.com",
52
+ public_key = "<your-public-key>",
53
+ secret_key = "<your-secret-key>",
54
+ ) as client:
55
+ docs = await client.list_documents()
56
+
57
+ # Query multiple documents concurrently
58
+ results = await client.query_many([
59
+ (docs[0].doc_id, "What are the probation terms?"),
60
+ (docs[1].doc_id, "What does the law say about probation?"),
61
+ ])
62
+ for r in results:
63
+ print(r.answer)
64
+
65
+ asyncio.run(main())
66
+ ```
67
+
68
+ ## Streaming
69
+
70
+ ```python
71
+ for event in client.query_stream(doc_id, "What are the key clauses?"):
72
+ if event.type == "token":
73
+ print(event.content, end="", flush=True)
74
+ elif event.type == "sources":
75
+ for src in event.sources:
76
+ print(f"\nSource: {src.citation}")
77
+ elif event.type == "done":
78
+ break
79
+ ```
80
+
81
+ ## MCP server (Claude Desktop / Cursor)
82
+
83
+ Run the MCP server so Claude or any MCP-compatible agent can query your documents:
84
+
85
+ ```json
86
+ {
87
+ "mcpServers": {
88
+ "pageindex": {
89
+ "command": "pageserve",
90
+ "args": ["mcp"],
91
+ "env": {
92
+ "PAGESERVE_URL": "https://pageindex.company.com",
93
+ "PAGESERVE_PUBLIC_KEY": "<your-public-key>",
94
+ "PAGESERVE_SECRET_KEY": "<your-secret-key>"
95
+ }
96
+ }
97
+ }
98
+ }
99
+ ```
100
+
101
+ The MCP server exposes five tools:
102
+ - `list_documents` — see what documents are available
103
+ - `query_document` — ask a question against one document
104
+ - `query_multiple_documents` — cross-reference multiple documents
105
+ - `get_page_content` — read raw page text (no LLM, instant)
106
+ - `get_document_structure` — browse the table of contents
107
+
108
+ Keys live in env vars and never appear in tool arguments or the model's context window.
109
+
110
+ ## CLI
111
+
112
+ ```bash
113
+ export PAGESERVE_URL=https://pageindex.company.com
114
+ export PAGESERVE_PUBLIC_KEY="<your-public-key>"
114
+ export PAGESERVE_SECRET_KEY="<your-secret-key>"
116
+
117
+ pageserve list # list documents
118
+ pageserve query <doc_id> "question" # ask a question
119
+ pageserve query <doc_id> "question" --stream # streaming output
120
+ pageserve upload ./report.pdf --watch # upload + progress bar
121
+ pageserve health # service status
122
+ pageserve keys list # list API keys
123
+ pageserve keys create "My App" # create a key
124
+ pageserve mcp # run MCP server (stdio)
125
+ pageserve mcp --transport sse --port 3000 # MCP over SSE
126
+ ```
127
+
128
+ ## Error handling
129
+
130
+ ```python
131
+ from pageserve import (
132
+ AuthError,
133
+ NotFoundError,
134
+ DocumentNotReadyError,
135
+ FileTooLargeError,
136
+ RateLimitError,
137
+ ServiceUnavailableError,
138
+ ServiceError,
139
+ )
140
+
141
+ try:
142
+ result = client.query(doc_id, "question")
143
+ except AuthError:
144
+ print("Invalid or expired API key")
145
+ except NotFoundError:
146
+ print("Document not found")
147
+ except RateLimitError as e:
148
+ import time; time.sleep(e.retry_after)
149
+ except ServiceError as e:
150
+ print(f"Server error [{e.status_code}]")
151
+ ```
152
+
153
+ ## Documentation
154
+
155
+ - [Getting Started](docs/getting-started.md)
156
+ - [Authentication](docs/authentication.md)
157
+ - [Sync Client Reference](docs/sync-client.md)
158
+ - [Async Client Reference](docs/async-client.md)
159
+ - [Streaming (SSE)](docs/streaming.md)
160
+ - [MCP Server](docs/mcp-server.md)
161
+ - [CLI Reference](docs/cli.md)
162
+ - [Data Models](docs/models.md)
163
+ - [Error Handling](docs/error-handling.md)
164
+
165
+ ## Related
166
+
167
+ - [PageIndex OSS](https://github.com/VectifyAI/PageIndex) — the self-hosted service this SDK wraps
168
+ - [PageIndex Cloud](https://pageindex.ai) — hosted version
@@ -0,0 +1,68 @@
1
+ from pageserve._async_client import AsyncPageServeClient
2
+ from pageserve._client import PageServeClient
3
+ from pageserve._exceptions import (
4
+ AuthError,
5
+ DocumentNotReadyError,
6
+ FileTooLargeError,
7
+ InsufficientStorageError,
8
+ NotFoundError,
9
+ PageServeError,
10
+ RateLimitError,
11
+ ServiceError,
12
+ ServiceUnavailableError,
13
+ TimeoutError,
14
+ )
15
+ from pageserve._models import (
16
+ ApiKey,
17
+ CreatedApiKey,
18
+ Document,
19
+ DocumentList,
20
+ HealthResult,
21
+ IndexProgress,
22
+ Page,
23
+ QueryResult,
24
+ QuerySource,
25
+ SSEEvent,
26
+ Stats,
27
+ StructureNode,
28
+ UploadResult,
29
+ Webhook,
30
+ WebhookTestResult,
31
+ )
32
+
33
+ __version__ = "0.1.0"
34
+ __author__ = "pageserve"
35
+ __license__ = "Apache-2.0"
36
+
37
+ __all__ = [
38
+ # Clients
39
+ "PageServeClient",
40
+ "AsyncPageServeClient",
41
+ # Models
42
+ "Document",
43
+ "DocumentList",
44
+ "UploadResult",
45
+ "StructureNode",
46
+ "Page",
47
+ "QueryResult",
48
+ "QuerySource",
49
+ "SSEEvent",
50
+ "IndexProgress",
51
+ "ApiKey",
52
+ "CreatedApiKey",
53
+ "Stats",
54
+ "Webhook",
55
+ "WebhookTestResult",
56
+ "HealthResult",
57
+ # Exceptions
58
+ "PageServeError",
59
+ "AuthError",
60
+ "NotFoundError",
61
+ "DocumentNotReadyError",
62
+ "FileTooLargeError",
63
+ "ServiceUnavailableError",
64
+ "InsufficientStorageError",
65
+ "RateLimitError",
66
+ "ServiceError",
67
+ "TimeoutError",
68
+ ]