pgsync 2.5.0__tar.gz → 3.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pgsync-3.1.0/LICENSE +21 -0
- {pgsync-2.5.0 → pgsync-3.1.0}/PKG-INFO +36 -6
- {pgsync-2.5.0 → pgsync-3.1.0}/README.md +5 -5
- {pgsync-2.5.0 → pgsync-3.1.0}/README.rst +1 -1
- {pgsync-2.5.0 → pgsync-3.1.0}/bin/bootstrap +8 -2
- {pgsync-2.5.0 → pgsync-3.1.0}/bin/parallel_sync +104 -97
- {pgsync-2.5.0 → pgsync-3.1.0}/pgsync/__init__.py +1 -1
- {pgsync-2.5.0 → pgsync-3.1.0}/pgsync/base.py +202 -159
- {pgsync-2.5.0 → pgsync-3.1.0}/pgsync/constants.py +14 -1
- {pgsync-2.5.0 → pgsync-3.1.0}/pgsync/helper.py +18 -8
- {pgsync-2.5.0 → pgsync-3.1.0}/pgsync/node.py +62 -48
- {pgsync-2.5.0 → pgsync-3.1.0}/pgsync/plugin.py +16 -5
- {pgsync-2.5.0 → pgsync-3.1.0}/pgsync/querybuilder.py +28 -46
- {pgsync-2.5.0 → pgsync-3.1.0}/pgsync/redisqueue.py +5 -5
- {pgsync-2.5.0 → pgsync-3.1.0}/pgsync/search_client.py +108 -76
- {pgsync-2.5.0 → pgsync-3.1.0}/pgsync/settings.py +26 -6
- pgsync-3.1.0/pgsync/singleton.py +39 -0
- {pgsync-2.5.0 → pgsync-3.1.0}/pgsync/sync.py +137 -100
- {pgsync-2.5.0 → pgsync-3.1.0}/pgsync/transform.py +20 -9
- {pgsync-2.5.0 → pgsync-3.1.0}/pgsync/trigger.py +7 -1
- pgsync-3.1.0/pgsync/urls.py +145 -0
- {pgsync-2.5.0 → pgsync-3.1.0}/pgsync/utils.py +77 -26
- {pgsync-2.5.0 → pgsync-3.1.0}/pgsync/view.py +215 -44
- {pgsync-2.5.0 → pgsync-3.1.0}/pgsync.egg-info/PKG-INFO +36 -6
- pgsync-3.1.0/pgsync.egg-info/requires.txt +29 -0
- pgsync-3.1.0/pyproject.toml +3 -0
- {pgsync-2.5.0 → pgsync-3.1.0}/setup.cfg +1 -1
- {pgsync-2.5.0 → pgsync-3.1.0}/setup.py +6 -5
- {pgsync-2.5.0 → pgsync-3.1.0}/tests/conftest.py +133 -82
- {pgsync-2.5.0 → pgsync-3.1.0}/tests/test_base.py +96 -73
- {pgsync-2.5.0 → pgsync-3.1.0}/tests/test_constants.py +1 -0
- {pgsync-2.5.0 → pgsync-3.1.0}/tests/test_node.py +24 -21
- {pgsync-2.5.0 → pgsync-3.1.0}/tests/test_redisqueue.py +16 -16
- {pgsync-2.5.0 → pgsync-3.1.0}/tests/test_search_client.py +5 -9
- {pgsync-2.5.0 → pgsync-3.1.0}/tests/test_settings.py +1 -1
- {pgsync-2.5.0 → pgsync-3.1.0}/tests/test_sync.py +23 -26
- {pgsync-2.5.0 → pgsync-3.1.0}/tests/test_sync_nested_children.py +39 -44
- {pgsync-2.5.0 → pgsync-3.1.0}/tests/test_sync_root.py +35 -55
- {pgsync-2.5.0 → pgsync-3.1.0}/tests/test_sync_single_child_fk_on_child.py +35 -48
- {pgsync-2.5.0 → pgsync-3.1.0}/tests/test_sync_single_child_fk_on_parent.py +35 -48
- {pgsync-2.5.0 → pgsync-3.1.0}/tests/test_trigger.py +3 -2
- {pgsync-2.5.0 → pgsync-3.1.0}/tests/test_unique_behaviour.py +2 -10
- {pgsync-2.5.0 → pgsync-3.1.0}/tests/test_utils.py +4 -4
- {pgsync-2.5.0 → pgsync-3.1.0}/tests/test_view.py +83 -55
- {pgsync-2.5.0 → pgsync-3.1.0}/tests/testing_utils.py +5 -3
- pgsync-2.5.0/LICENSE +0 -165
- pgsync-2.5.0/pgsync/singleton.py +0 -20
- pgsync-2.5.0/pgsync/urls.py +0 -99
- pgsync-2.5.0/pgsync.egg-info/requires.txt +0 -43
- pgsync-2.5.0/pyproject.toml +0 -3
- {pgsync-2.5.0 → pgsync-3.1.0}/AUTHORS.rst +0 -0
- {pgsync-2.5.0 → pgsync-3.1.0}/CONTRIBUTING.rst +0 -0
- {pgsync-2.5.0 → pgsync-3.1.0}/HISTORY.rst +0 -0
- {pgsync-2.5.0 → pgsync-3.1.0}/MANIFEST.in +0 -0
- {pgsync-2.5.0 → pgsync-3.1.0}/bin/pgsync +0 -0
- {pgsync-2.5.0 → pgsync-3.1.0}/docs/Makefile +0 -0
- {pgsync-2.5.0 → pgsync-3.1.0}/docs/authors.rst +0 -0
- {pgsync-2.5.0 → pgsync-3.1.0}/docs/changelog.rst +0 -0
- {pgsync-2.5.0 → pgsync-3.1.0}/docs/conf.py +0 -0
- {pgsync-2.5.0 → pgsync-3.1.0}/docs/contributing.rst +0 -0
- {pgsync-2.5.0 → pgsync-3.1.0}/docs/history.rst +0 -0
- {pgsync-2.5.0 → pgsync-3.1.0}/docs/index.rst +0 -0
- {pgsync-2.5.0 → pgsync-3.1.0}/docs/installation.rst +0 -0
- {pgsync-2.5.0 → pgsync-3.1.0}/docs/logo.png +0 -0
- {pgsync-2.5.0 → pgsync-3.1.0}/docs/make.bat +0 -0
- {pgsync-2.5.0 → pgsync-3.1.0}/docs/readme.rst +0 -0
- {pgsync-2.5.0 → pgsync-3.1.0}/docs/usage.rst +0 -0
- {pgsync-2.5.0 → pgsync-3.1.0}/pgsync/exc.py +0 -0
- {pgsync-2.5.0 → pgsync-3.1.0}/pgsync.egg-info/SOURCES.txt +0 -0
- {pgsync-2.5.0 → pgsync-3.1.0}/pgsync.egg-info/dependency_links.txt +0 -0
- {pgsync-2.5.0 → pgsync-3.1.0}/pgsync.egg-info/not-zip-safe +0 -0
- {pgsync-2.5.0 → pgsync-3.1.0}/pgsync.egg-info/top_level.txt +0 -0
- {pgsync-2.5.0 → pgsync-3.1.0}/tests/__init__.py +0 -0
- {pgsync-2.5.0 → pgsync-3.1.0}/tests/fixtures/schema.json +0 -0
- {pgsync-2.5.0 → pgsync-3.1.0}/tests/test_env_vars.py +0 -0
- {pgsync-2.5.0 → pgsync-3.1.0}/tests/test_helper.py +0 -0
- {pgsync-2.5.0 → pgsync-3.1.0}/tests/test_log_handlers.py +0 -0
- {pgsync-2.5.0 → pgsync-3.1.0}/tests/test_query_builder.py +0 -0
- {pgsync-2.5.0 → pgsync-3.1.0}/tests/test_transform.py +0 -0
- {pgsync-2.5.0 → pgsync-3.1.0}/tests/test_urls.py +0 -0
pgsync-3.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2023 Tolu Aina
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: pgsync
|
|
3
|
-
Version:
|
|
3
|
+
Version: 3.1.0
|
|
4
4
|
Summary: Postgres to Elasticsearch/OpenSearch sync
|
|
5
5
|
Home-page: https://github.com/toluaina/pgsync
|
|
6
6
|
Author: Tolu Aina
|
|
7
7
|
Author-email: tolu@pgsync.com
|
|
8
8
|
Maintainer: Tolu Aina
|
|
9
9
|
Maintainer-email: tolu@pgsync.com
|
|
10
|
-
License:
|
|
10
|
+
License: MIT
|
|
11
11
|
Project-URL: Bug Reports, https://github.com/toluaina/pgsync/issues
|
|
12
12
|
Project-URL: Funding, https://github.com/sponsors/toluaina
|
|
13
13
|
Project-URL: Source, https://github.com/toluaina/pgsync
|
|
@@ -17,18 +17,48 @@ Keywords: pgsync,elasticsearch,opensearch,postgres,change data capture
|
|
|
17
17
|
Classifier: Development Status :: 5 - Production/Stable
|
|
18
18
|
Classifier: Intended Audience :: Developers
|
|
19
19
|
Classifier: Natural Language :: English
|
|
20
|
-
Classifier: Programming Language :: Python :: 3.7
|
|
21
20
|
Classifier: Programming Language :: Python :: 3.8
|
|
22
21
|
Classifier: Programming Language :: Python :: 3.9
|
|
23
22
|
Classifier: Programming Language :: Python :: 3.10
|
|
24
23
|
Classifier: Programming Language :: Python :: 3.11
|
|
24
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
25
25
|
Classifier: Programming Language :: Python :: Implementation :: CPython
|
|
26
26
|
Classifier: Programming Language :: Python :: Implementation :: PyPy
|
|
27
|
-
Classifier: License :: OSI Approved ::
|
|
28
|
-
|
|
27
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
28
|
+
Classifier: Operating System :: OS Independent
|
|
29
|
+
Requires-Python: >=3.8.0
|
|
29
30
|
Description-Content-Type: text/markdown
|
|
30
31
|
License-File: LICENSE
|
|
31
32
|
License-File: AUTHORS.rst
|
|
33
|
+
Requires-Dist: async-timeout==4.0.3
|
|
34
|
+
Requires-Dist: boto3==1.34.11
|
|
35
|
+
Requires-Dist: botocore==1.34.11
|
|
36
|
+
Requires-Dist: certifi==2023.11.17
|
|
37
|
+
Requires-Dist: charset-normalizer==3.3.2
|
|
38
|
+
Requires-Dist: click==8.1.7
|
|
39
|
+
Requires-Dist: elastic-transport==8.11.0
|
|
40
|
+
Requires-Dist: elasticsearch==8.11.1
|
|
41
|
+
Requires-Dist: elasticsearch-dsl==8.11.0
|
|
42
|
+
Requires-Dist: environs==10.0.0
|
|
43
|
+
Requires-Dist: greenlet==3.0.3
|
|
44
|
+
Requires-Dist: idna==3.6
|
|
45
|
+
Requires-Dist: jmespath==1.0.1
|
|
46
|
+
Requires-Dist: marshmallow==3.20.1
|
|
47
|
+
Requires-Dist: opensearch-dsl==2.1.0
|
|
48
|
+
Requires-Dist: opensearch-py==2.4.2
|
|
49
|
+
Requires-Dist: packaging==23.2
|
|
50
|
+
Requires-Dist: psycopg2-binary==2.9.9
|
|
51
|
+
Requires-Dist: python-dateutil==2.8.2
|
|
52
|
+
Requires-Dist: python-dotenv==1.0.0
|
|
53
|
+
Requires-Dist: redis==5.0.1
|
|
54
|
+
Requires-Dist: requests==2.31.0
|
|
55
|
+
Requires-Dist: requests-aws4auth==1.2.3
|
|
56
|
+
Requires-Dist: s3transfer==0.10.0
|
|
57
|
+
Requires-Dist: six==1.16.0
|
|
58
|
+
Requires-Dist: sqlalchemy==2.0.25
|
|
59
|
+
Requires-Dist: sqlparse==0.4.4
|
|
60
|
+
Requires-Dist: typing-extensions==4.9.0
|
|
61
|
+
Requires-Dist: urllib3==1.26.18
|
|
32
62
|
|
|
33
63
|
# PostgreSQL to Elasticsearch/OpenSearch sync
|
|
34
64
|
|
|
@@ -40,7 +70,7 @@ expose structured denormalized documents in [Elasticsearch](https://www.elastic.
|
|
|
40
70
|
|
|
41
71
|
### Requirements
|
|
42
72
|
|
|
43
|
-
- [Python](https://www.python.org) 3.
|
|
73
|
+
- [Python](https://www.python.org) 3.8+
|
|
44
74
|
- [Postgres](https://www.postgresql.org) 9.6+
|
|
45
75
|
- [Redis](https://redis.io) 3.1.0
|
|
46
76
|
- [Elasticsearch](https://www.elastic.co/products/elastic-stack) 6.3.1+ or [OpenSearch](https://opensearch.org/) 1.3.7+
|
|
@@ -66,7 +66,7 @@ the search capabilities of [Elasticsearch](https://www.elastic.co/products/elast
|
|
|
66
66
|
|
|
67
67
|
#### How it works
|
|
68
68
|
|
|
69
|
-
PGSync is written in Python (supporting version 3.
|
|
69
|
+
PGSync is written in Python (supporting version 3.8 onwards) and the stack is composed of: [Redis](https://redis.io), [Elasticsearch](https://www.elastic.co/products/elastic-stack)/[OpenSearch](https://opensearch.org/), [Postgres](https://www.postgresql.org), and [SQlAlchemy](https://www.sqlalchemy.org).
|
|
70
70
|
|
|
71
71
|
PGSync leverages the [logical decoding](https://www.postgresql.org/docs/current/logicaldecoding.html) feature of [Postgres](https://www.postgresql.org) (introduced in PostgreSQL 9.4) to capture a continuous stream of change events.
|
|
72
72
|
This feature needs to be enabled in your [Postgres](https://www.postgresql.org) configuration file by setting in the postgresql.conf file:
|
|
@@ -152,7 +152,7 @@ Key features of PGSync are:
|
|
|
152
152
|
|
|
153
153
|
#### Requirements
|
|
154
154
|
|
|
155
|
-
- [Python](https://www.python.org) 3.
|
|
155
|
+
- [Python](https://www.python.org) 3.8+
|
|
156
156
|
- [Postgres](https://www.postgresql.org) 9.6+
|
|
157
157
|
- [Redis](https://redis.io) 3.1.0
|
|
158
158
|
- [Elasticsearch](https://www.elastic.co/products/elastic-stack) 6.3.1+ or [OpenSearch](https://opensearch.org/) 1.3.7+
|
|
@@ -305,8 +305,8 @@ Contributions are very welcome! Check out the [Contribution](CONTRIBUTING.rst) G
|
|
|
305
305
|
|
|
306
306
|
#### License
|
|
307
307
|
|
|
308
|
-
This
|
|
308
|
+
This project is licensed under the terms of the [MIT](https://opensource.org/license/mit/) license.
|
|
309
309
|
Please see [LICENSE](LICENSE) for more details.
|
|
310
310
|
|
|
311
|
-
You should have received a copy of the
|
|
312
|
-
If not, see https://
|
|
311
|
+
You should have received a copy of the MIT License along with PGSync.
|
|
312
|
+
If not, see https://opensource.org/license/mit/.
|
|
@@ -8,7 +8,7 @@ expose structured denormalized documents in [Elasticsearch](https://www.elastic.
|
|
|
8
8
|
|
|
9
9
|
### Requirements
|
|
10
10
|
|
|
11
|
-
- [Python](https://www.python.org) 3.
|
|
11
|
+
- [Python](https://www.python.org) 3.8+
|
|
12
12
|
- [Postgres](https://www.postgresql.org) 9.6+
|
|
13
13
|
- [Redis](https://redis.io) 3.1.0
|
|
14
14
|
- [Elasticsearch](https://www.elastic.co/products/elastic-stack) 6.3.1+ or [OpenSearch](https://opensearch.org/) 1.3.7+
|
|
@@ -54,9 +54,15 @@ def main(teardown, config, user, password, host, port, verbose):
|
|
|
54
54
|
|
|
55
55
|
show_settings(config)
|
|
56
56
|
|
|
57
|
-
|
|
57
|
+
validate: bool = False if teardown else True
|
|
58
|
+
|
|
59
|
+
for doc in config_loader(config):
|
|
58
60
|
sync: Sync = Sync(
|
|
59
|
-
|
|
61
|
+
doc,
|
|
62
|
+
verbose=verbose,
|
|
63
|
+
validate=validate,
|
|
64
|
+
repl_slots=False,
|
|
65
|
+
**kwargs,
|
|
60
66
|
)
|
|
61
67
|
if teardown:
|
|
62
68
|
sync.teardown()
|
|
@@ -1,44 +1,43 @@
|
|
|
1
1
|
#!/usr/bin/env python
|
|
2
2
|
|
|
3
3
|
"""
|
|
4
|
-
Parallel sync is an experimental feature
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
query
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
and row numbers.
|
|
4
|
+
Parallel sync is an innovative, experimental feature designed to optimize
|
|
5
|
+
throughput by utilizing available CPUs/threads, particularly beneficial
|
|
6
|
+
in environments experiencing high network latency.
|
|
7
|
+
|
|
8
|
+
Scenario & Challenge:
|
|
9
|
+
In instances where your PG database, Elasticsearch/OpenSearch, and PGSync
|
|
10
|
+
servers operate on divergent networks, a delay in request/response time is
|
|
11
|
+
noticeable. The primary constraint emerges from the database query's roundtrip,
|
|
12
|
+
which even server-side cursors can address only to a limited extent by fetching
|
|
13
|
+
a certain number of records at a time. The consequent delay in fetching the
|
|
14
|
+
next cursor significantly hampers the overall synchronization speed.
|
|
15
|
+
|
|
16
|
+
Solution:
|
|
17
|
+
To mitigate this, the strategy is to conduct an initial fast/parallel sync,
|
|
18
|
+
thereby populating Elasticsearch/OpenSearch in a single iteration.
|
|
19
|
+
Post this, the regular pgsync can continue running as a daemon.
|
|
20
|
+
|
|
21
|
+
Approach and Technical Implementation:
|
|
22
|
+
The approach centers around utilizing the Tuple identifier record of the table
|
|
23
|
+
columns. Every table incorporates a system column – "ctid" of type "tid,"
|
|
24
|
+
which helps identify the page record and the row number in each block.
|
|
25
|
+
This element facilitates the pagination of the sync process.
|
|
26
|
+
|
|
27
|
+
Technically, pagination implies dividing each paged record amongst the
|
|
28
|
+
available CPUs/threads. This division enables the parallel execution of
|
|
29
|
+
Elasticsearch/OpenSearch bulk inserts. The "ctid" serves as a tuple
|
|
30
|
+
(for instance, (1, 5)), pinpointing the row in a disk page.
|
|
31
|
+
|
|
32
|
+
By leveraging this method, all paged row records are retrieved upfront and
|
|
33
|
+
allocated as work units across the worker threads/CPUs.
|
|
34
|
+
Each work unit, defined by the BLOCK_SIZE, denotes the number of root node
|
|
35
|
+
records assigned for each worker to process.
|
|
36
|
+
|
|
37
|
+
Subsequently, the workers execute queries for each assigned chunk of work,
|
|
38
|
+
filtered based on the page number and row numbers.
|
|
39
|
+
This systematic and parallel approach optimizes the synchronization process,
|
|
40
|
+
especially in environments challenged by network latency.
|
|
42
41
|
"""
|
|
43
42
|
|
|
44
43
|
import asyncio
|
|
@@ -46,39 +45,50 @@ import multiprocessing
|
|
|
46
45
|
import os
|
|
47
46
|
import re
|
|
48
47
|
import sys
|
|
48
|
+
import typing as t
|
|
49
49
|
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
|
|
50
50
|
from dataclasses import dataclass
|
|
51
51
|
from queue import Queue
|
|
52
52
|
from threading import Thread
|
|
53
|
-
from typing import Generator, Optional, Union
|
|
54
53
|
|
|
55
54
|
import click
|
|
56
55
|
import sqlalchemy as sa
|
|
57
56
|
|
|
58
57
|
from pgsync.settings import BLOCK_SIZE, CHECKPOINT_PATH
|
|
59
58
|
from pgsync.sync import Sync
|
|
60
|
-
from pgsync.utils import
|
|
61
|
-
|
|
62
|
-
config_loader,
|
|
63
|
-
get_config,
|
|
64
|
-
show_settings,
|
|
65
|
-
timeit,
|
|
66
|
-
)
|
|
59
|
+
from pgsync.utils import config_loader, get_config, show_settings, timeit
|
|
60
|
+
|
|
67
61
|
|
|
62
|
+
def save_ctid(page: int, row: int, filename: str) -> None:
|
|
63
|
+
"""
|
|
64
|
+
Save the checkpoint for a given page and row in a file with the given name.
|
|
68
65
|
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
66
|
+
Args:
|
|
67
|
+
page (int): The page number to save.
|
|
68
|
+
row (int): The row number to save.
|
|
69
|
+
filename (str): The name of the file to save the checkpoint in.
|
|
70
|
+
"""
|
|
71
|
+
filepath: str = os.path.join(CHECKPOINT_PATH, f".{filename}.ctid")
|
|
72
|
+
with open(filepath, "w+") as fp:
|
|
72
73
|
fp.write(f"{page},{row}\n")
|
|
73
74
|
|
|
74
75
|
|
|
75
|
-
def read_ctid(
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
76
|
+
def read_ctid(filename: str) -> t.Tuple[t.Optional[int], t.Optional[int]]:
|
|
77
|
+
"""
|
|
78
|
+
Reads the checkpoint file for the given name and returns the page and row numbers.
|
|
79
|
+
|
|
80
|
+
Args:
|
|
81
|
+
filename (str): The name of the checkpoint file.
|
|
82
|
+
|
|
83
|
+
Returns:
|
|
84
|
+
tuple: A tuple containing the page and row numbers. If the checkpoint file does not exist, returns (None, None).
|
|
85
|
+
"""
|
|
86
|
+
filepath: str = os.path.join(CHECKPOINT_PATH, f".{filename}.ctid")
|
|
87
|
+
if os.path.exists(filepath):
|
|
88
|
+
with open(filepath, "r") as fp:
|
|
79
89
|
pairs: str = fp.read().split()[0].split(",")
|
|
80
|
-
page = int(pairs[0])
|
|
81
|
-
row = int(pairs[1])
|
|
90
|
+
page: int = int(pairs[0])
|
|
91
|
+
row: int = int(pairs[1])
|
|
82
92
|
return page, row
|
|
83
93
|
return None, None
|
|
84
94
|
|
|
@@ -104,7 +114,6 @@ class Task:
|
|
|
104
114
|
sync: Sync = Sync(
|
|
105
115
|
self.doc, verbose=self.verbose, validate=self.validate
|
|
106
116
|
)
|
|
107
|
-
sync.tree.build(sync.nodes)
|
|
108
117
|
txmin: int = sync.checkpoint
|
|
109
118
|
txmax: int = sync.txid_current
|
|
110
119
|
sync.search_client.bulk(
|
|
@@ -118,19 +127,19 @@ class Task:
|
|
|
118
127
|
@timeit
|
|
119
128
|
def fetch_tasks(
|
|
120
129
|
doc: dict,
|
|
121
|
-
block_size: Optional[int] = None,
|
|
122
|
-
) -> Generator:
|
|
130
|
+
block_size: t.Optional[int] = None,
|
|
131
|
+
) -> t.Generator:
|
|
123
132
|
block_size = block_size or BLOCK_SIZE
|
|
124
133
|
pages: dict = {}
|
|
125
134
|
sync: Sync = Sync(doc)
|
|
126
|
-
page: Optional[int] = None
|
|
127
|
-
row: Optional[int] = None
|
|
128
|
-
|
|
135
|
+
page: t.Optional[int] = None
|
|
136
|
+
row: t.Optional[int] = None
|
|
137
|
+
filename: str = re.sub(
|
|
129
138
|
"[^0-9a-zA-Z_]+", "", f"{sync.database.lower()}_{sync.index}"
|
|
130
139
|
)
|
|
131
|
-
page, row = read_ctid(
|
|
140
|
+
page, row = read_ctid(filename)
|
|
132
141
|
statement: sa.sql.Select = sa.select(
|
|
133
|
-
[
|
|
142
|
+
*[
|
|
134
143
|
sa.literal_column("1").label("x"),
|
|
135
144
|
sa.literal_column("1").label("y"),
|
|
136
145
|
sa.column("ctid"),
|
|
@@ -197,11 +206,13 @@ def fetch_tasks(
|
|
|
197
206
|
|
|
198
207
|
@timeit
|
|
199
208
|
def synchronous(
|
|
200
|
-
tasks: Generator,
|
|
209
|
+
tasks: t.Generator,
|
|
210
|
+
doc: dict,
|
|
211
|
+
verbose: bool = False,
|
|
212
|
+
validate: bool = False,
|
|
201
213
|
) -> None:
|
|
202
214
|
sys.stdout.write("Synchronous\n")
|
|
203
215
|
sync: Sync = Sync(doc, verbose=verbose, validate=validate)
|
|
204
|
-
sync.tree.build(sync.nodes)
|
|
205
216
|
txmin: int = sync.checkpoint
|
|
206
217
|
txmax: int = sync.txid_current
|
|
207
218
|
index: str = sync.index
|
|
@@ -215,9 +226,9 @@ def synchronous(
|
|
|
215
226
|
|
|
216
227
|
@timeit
|
|
217
228
|
def multithreaded(
|
|
218
|
-
tasks: Generator,
|
|
229
|
+
tasks: t.Generator,
|
|
219
230
|
doc: dict,
|
|
220
|
-
|
|
231
|
+
nthreads: t.Optional[int] = None,
|
|
221
232
|
verbose: bool = False,
|
|
222
233
|
validate: bool = False,
|
|
223
234
|
) -> None:
|
|
@@ -234,12 +245,11 @@ def multithreaded(
|
|
|
234
245
|
)
|
|
235
246
|
queue.task_done()
|
|
236
247
|
|
|
237
|
-
|
|
248
|
+
nthreads: int = nthreads or 1
|
|
238
249
|
queue: Queue = Queue()
|
|
239
250
|
sync: Sync = Sync(doc, verbose=verbose, validate=validate)
|
|
240
|
-
sync.tree.build(sync.nodes)
|
|
241
251
|
|
|
242
|
-
for _ in range(
|
|
252
|
+
for _ in range(nthreads):
|
|
243
253
|
thread: Thread = Thread(
|
|
244
254
|
target=worker,
|
|
245
255
|
args=(
|
|
@@ -258,15 +268,15 @@ def multithreaded(
|
|
|
258
268
|
|
|
259
269
|
@timeit
|
|
260
270
|
def multiprocess(
|
|
261
|
-
tasks: Generator,
|
|
271
|
+
tasks: t.Generator,
|
|
262
272
|
doc: dict,
|
|
263
|
-
|
|
273
|
+
ncpus: t.Optional[int] = None,
|
|
264
274
|
verbose: bool = False,
|
|
265
275
|
validate: bool = False,
|
|
266
276
|
) -> None:
|
|
267
277
|
sys.stdout.write("Multiprocess\n")
|
|
268
278
|
task: Task = Task(doc, verbose=verbose, validate=validate)
|
|
269
|
-
with ProcessPoolExecutor(max_workers=
|
|
279
|
+
with ProcessPoolExecutor(max_workers=ncpus) as executor:
|
|
270
280
|
try:
|
|
271
281
|
list(executor.map(task.process, tasks))
|
|
272
282
|
except Exception as e:
|
|
@@ -276,14 +286,14 @@ def multiprocess(
|
|
|
276
286
|
|
|
277
287
|
@timeit
|
|
278
288
|
def multithreaded_async(
|
|
279
|
-
tasks: Generator,
|
|
289
|
+
tasks: t.Generator,
|
|
280
290
|
doc: dict,
|
|
281
|
-
|
|
291
|
+
nthreads: t.Optional[int] = None,
|
|
282
292
|
verbose: bool = False,
|
|
283
293
|
validate: bool = False,
|
|
284
294
|
) -> None:
|
|
285
295
|
sys.stdout.write("Multi-threaded async\n")
|
|
286
|
-
executor: ThreadPoolExecutor = ThreadPoolExecutor(max_workers=
|
|
296
|
+
executor: ThreadPoolExecutor = ThreadPoolExecutor(max_workers=nthreads)
|
|
287
297
|
event_loop = asyncio.get_event_loop()
|
|
288
298
|
event_loop.run_until_complete(
|
|
289
299
|
run_tasks(executor, tasks, doc, verbose=verbose, validate=validate)
|
|
@@ -293,14 +303,14 @@ def multithreaded_async(
|
|
|
293
303
|
|
|
294
304
|
@timeit
|
|
295
305
|
def multiprocess_async(
|
|
296
|
-
tasks: Generator,
|
|
306
|
+
tasks: t.Generator,
|
|
297
307
|
doc: dict,
|
|
298
|
-
|
|
308
|
+
ncpus: t.Optional[int] = None,
|
|
299
309
|
verbose: bool = False,
|
|
300
310
|
validate: bool = False,
|
|
301
311
|
) -> None:
|
|
302
312
|
sys.stdout.write("Multi-process async\n")
|
|
303
|
-
executor: ProcessPoolExecutor = ProcessPoolExecutor(max_workers=
|
|
313
|
+
executor: ProcessPoolExecutor = ProcessPoolExecutor(max_workers=ncpus)
|
|
304
314
|
event_loop = asyncio.get_event_loop()
|
|
305
315
|
try:
|
|
306
316
|
event_loop.run_until_complete(
|
|
@@ -312,18 +322,18 @@ def multiprocess_async(
|
|
|
312
322
|
|
|
313
323
|
|
|
314
324
|
async def run_tasks(
|
|
315
|
-
executor: Union[ThreadPoolExecutor, ProcessPoolExecutor],
|
|
316
|
-
tasks: Generator,
|
|
325
|
+
executor: t.Union[ThreadPoolExecutor, ProcessPoolExecutor],
|
|
326
|
+
tasks: t.Generator,
|
|
317
327
|
doc: dict,
|
|
318
328
|
verbose: bool = False,
|
|
319
329
|
validate: bool = False,
|
|
320
330
|
) -> None:
|
|
321
|
-
sync: Optional[Sync] = None
|
|
331
|
+
sync: t.Optional[Sync] = None
|
|
322
332
|
if isinstance(executor, ThreadPoolExecutor):
|
|
323
333
|
# threads can share a common Sync object
|
|
324
334
|
sync = Sync(doc, verbose=verbose, validate=validate)
|
|
325
335
|
event_loop = asyncio.get_event_loop()
|
|
326
|
-
completed,
|
|
336
|
+
completed, _ = await asyncio.wait(
|
|
327
337
|
[
|
|
328
338
|
event_loop.run_in_executor(
|
|
329
339
|
executor, run_task, task, sync, doc, verbose, validate
|
|
@@ -338,14 +348,13 @@ async def run_tasks(
|
|
|
338
348
|
|
|
339
349
|
def run_task(
|
|
340
350
|
task: dict,
|
|
341
|
-
sync: Optional[Sync] = None,
|
|
342
|
-
doc: Optional[dict] = None,
|
|
351
|
+
sync: t.Optional[Sync] = None,
|
|
352
|
+
doc: t.Optional[dict] = None,
|
|
343
353
|
verbose: bool = False,
|
|
344
354
|
validate: bool = False,
|
|
345
355
|
) -> int:
|
|
346
356
|
if sync is None:
|
|
347
357
|
sync: Sync = Sync(doc, verbose=verbose, validate=validate)
|
|
348
|
-
sync.tree.build(sync.nodes)
|
|
349
358
|
txmin: int = sync.checkpoint
|
|
350
359
|
txmax: int = sync.txid_current
|
|
351
360
|
sync.search_client.bulk(
|
|
@@ -355,10 +364,10 @@ def run_task(
|
|
|
355
364
|
if len(task) > 0:
|
|
356
365
|
page: int = max(task.keys())
|
|
357
366
|
row: int = max(task[page])
|
|
358
|
-
|
|
367
|
+
filename: str = re.sub(
|
|
359
368
|
"[^0-9a-zA-Z_]+", "", f"{sync.database.lower()}_{sync.index}"
|
|
360
369
|
)
|
|
361
|
-
save_ctid(page
|
|
370
|
+
save_ctid(page, row, filename)
|
|
362
371
|
|
|
363
372
|
return 1
|
|
364
373
|
|
|
@@ -410,20 +419,18 @@ def main(config, nprocs, mode, verbose):
|
|
|
410
419
|
show_settings()
|
|
411
420
|
config: str = get_config(config)
|
|
412
421
|
|
|
413
|
-
for
|
|
414
|
-
tasks: Generator = fetch_tasks(
|
|
422
|
+
for doc in config_loader(config):
|
|
423
|
+
tasks: t.Generator = fetch_tasks(doc)
|
|
415
424
|
if mode == "synchronous":
|
|
416
|
-
synchronous(tasks,
|
|
425
|
+
synchronous(tasks, doc, verbose=verbose)
|
|
417
426
|
elif mode == "multithreaded":
|
|
418
|
-
multithreaded(tasks,
|
|
427
|
+
multithreaded(tasks, doc, nthreads=nprocs, verbose=verbose)
|
|
419
428
|
elif mode == "multiprocess":
|
|
420
|
-
multiprocess(tasks,
|
|
429
|
+
multiprocess(tasks, doc, ncpus=nprocs, verbose=verbose)
|
|
421
430
|
elif mode == "multithreaded_async":
|
|
422
|
-
multithreaded_async(
|
|
423
|
-
tasks, document, nprocs=nprocs, verbose=verbose
|
|
424
|
-
)
|
|
431
|
+
multithreaded_async(tasks, doc, nthreads=nprocs, verbose=verbose)
|
|
425
432
|
elif mode == "multiprocess_async":
|
|
426
|
-
multiprocess_async(tasks,
|
|
433
|
+
multiprocess_async(tasks, doc, ncpus=nprocs, verbose=verbose)
|
|
427
434
|
|
|
428
435
|
|
|
429
436
|
if __name__ == "__main__":
|