pgsync 3.0.0__tar.gz → 3.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {pgsync-3.0.0 → pgsync-3.1.0}/PKG-INFO +14 -19
- {pgsync-3.0.0 → pgsync-3.1.0}/bin/bootstrap +2 -2
- {pgsync-3.0.0 → pgsync-3.1.0}/bin/parallel_sync +72 -81
- {pgsync-3.0.0 → pgsync-3.1.0}/pgsync/__init__.py +1 -1
- {pgsync-3.0.0 → pgsync-3.1.0}/pgsync/base.py +174 -157
- {pgsync-3.0.0 → pgsync-3.1.0}/pgsync/constants.py +7 -4
- {pgsync-3.0.0 → pgsync-3.1.0}/pgsync/helper.py +6 -6
- {pgsync-3.0.0 → pgsync-3.1.0}/pgsync/node.py +46 -43
- {pgsync-3.0.0 → pgsync-3.1.0}/pgsync/plugin.py +4 -4
- {pgsync-3.0.0 → pgsync-3.1.0}/pgsync/querybuilder.py +28 -27
- {pgsync-3.0.0 → pgsync-3.1.0}/pgsync/redisqueue.py +4 -4
- {pgsync-3.0.0 → pgsync-3.1.0}/pgsync/search_client.py +94 -74
- {pgsync-3.0.0 → pgsync-3.1.0}/pgsync/settings.py +17 -5
- {pgsync-3.0.0 → pgsync-3.1.0}/pgsync/singleton.py +2 -2
- {pgsync-3.0.0 → pgsync-3.1.0}/pgsync/sync.py +108 -69
- {pgsync-3.0.0 → pgsync-3.1.0}/pgsync/transform.py +4 -4
- {pgsync-3.0.0 → pgsync-3.1.0}/pgsync/urls.py +17 -17
- {pgsync-3.0.0 → pgsync-3.1.0}/pgsync/utils.py +35 -26
- {pgsync-3.0.0 → pgsync-3.1.0}/pgsync/view.py +27 -25
- {pgsync-3.0.0 → pgsync-3.1.0}/pgsync.egg-info/PKG-INFO +14 -19
- {pgsync-3.0.0 → pgsync-3.1.0}/pgsync.egg-info/requires.txt +12 -18
- pgsync-3.1.0/pyproject.toml +3 -0
- {pgsync-3.0.0 → pgsync-3.1.0}/setup.cfg +1 -1
- {pgsync-3.0.0 → pgsync-3.1.0}/setup.py +2 -1
- {pgsync-3.0.0 → pgsync-3.1.0}/tests/conftest.py +133 -81
- {pgsync-3.0.0 → pgsync-3.1.0}/tests/test_base.py +96 -73
- {pgsync-3.0.0 → pgsync-3.1.0}/tests/test_node.py +24 -20
- {pgsync-3.0.0 → pgsync-3.1.0}/tests/test_search_client.py +5 -9
- {pgsync-3.0.0 → pgsync-3.1.0}/tests/test_settings.py +1 -1
- {pgsync-3.0.0 → pgsync-3.1.0}/tests/test_sync.py +22 -24
- {pgsync-3.0.0 → pgsync-3.1.0}/tests/test_sync_nested_children.py +39 -44
- {pgsync-3.0.0 → pgsync-3.1.0}/tests/test_sync_root.py +35 -55
- {pgsync-3.0.0 → pgsync-3.1.0}/tests/test_sync_single_child_fk_on_child.py +35 -47
- {pgsync-3.0.0 → pgsync-3.1.0}/tests/test_sync_single_child_fk_on_parent.py +35 -47
- {pgsync-3.0.0 → pgsync-3.1.0}/tests/test_trigger.py +3 -2
- {pgsync-3.0.0 → pgsync-3.1.0}/tests/test_unique_behaviour.py +2 -10
- {pgsync-3.0.0 → pgsync-3.1.0}/tests/test_utils.py +4 -4
- {pgsync-3.0.0 → pgsync-3.1.0}/tests/test_view.py +83 -55
- {pgsync-3.0.0 → pgsync-3.1.0}/tests/testing_utils.py +5 -3
- pgsync-3.0.0/pyproject.toml +0 -3
- {pgsync-3.0.0 → pgsync-3.1.0}/AUTHORS.rst +0 -0
- {pgsync-3.0.0 → pgsync-3.1.0}/CONTRIBUTING.rst +0 -0
- {pgsync-3.0.0 → pgsync-3.1.0}/HISTORY.rst +0 -0
- {pgsync-3.0.0 → pgsync-3.1.0}/LICENSE +0 -0
- {pgsync-3.0.0 → pgsync-3.1.0}/MANIFEST.in +0 -0
- {pgsync-3.0.0 → pgsync-3.1.0}/README.md +0 -0
- {pgsync-3.0.0 → pgsync-3.1.0}/README.rst +0 -0
- {pgsync-3.0.0 → pgsync-3.1.0}/bin/pgsync +0 -0
- {pgsync-3.0.0 → pgsync-3.1.0}/docs/Makefile +0 -0
- {pgsync-3.0.0 → pgsync-3.1.0}/docs/authors.rst +0 -0
- {pgsync-3.0.0 → pgsync-3.1.0}/docs/changelog.rst +0 -0
- {pgsync-3.0.0 → pgsync-3.1.0}/docs/conf.py +0 -0
- {pgsync-3.0.0 → pgsync-3.1.0}/docs/contributing.rst +0 -0
- {pgsync-3.0.0 → pgsync-3.1.0}/docs/history.rst +0 -0
- {pgsync-3.0.0 → pgsync-3.1.0}/docs/index.rst +0 -0
- {pgsync-3.0.0 → pgsync-3.1.0}/docs/installation.rst +0 -0
- {pgsync-3.0.0 → pgsync-3.1.0}/docs/logo.png +0 -0
- {pgsync-3.0.0 → pgsync-3.1.0}/docs/make.bat +0 -0
- {pgsync-3.0.0 → pgsync-3.1.0}/docs/readme.rst +0 -0
- {pgsync-3.0.0 → pgsync-3.1.0}/docs/usage.rst +0 -0
- {pgsync-3.0.0 → pgsync-3.1.0}/pgsync/exc.py +0 -0
- {pgsync-3.0.0 → pgsync-3.1.0}/pgsync/trigger.py +0 -0
- {pgsync-3.0.0 → pgsync-3.1.0}/pgsync.egg-info/SOURCES.txt +0 -0
- {pgsync-3.0.0 → pgsync-3.1.0}/pgsync.egg-info/dependency_links.txt +0 -0
- {pgsync-3.0.0 → pgsync-3.1.0}/pgsync.egg-info/not-zip-safe +0 -0
- {pgsync-3.0.0 → pgsync-3.1.0}/pgsync.egg-info/top_level.txt +0 -0
- {pgsync-3.0.0 → pgsync-3.1.0}/tests/__init__.py +0 -0
- {pgsync-3.0.0 → pgsync-3.1.0}/tests/fixtures/schema.json +0 -0
- {pgsync-3.0.0 → pgsync-3.1.0}/tests/test_constants.py +0 -0
- {pgsync-3.0.0 → pgsync-3.1.0}/tests/test_env_vars.py +0 -0
- {pgsync-3.0.0 → pgsync-3.1.0}/tests/test_helper.py +0 -0
- {pgsync-3.0.0 → pgsync-3.1.0}/tests/test_log_handlers.py +0 -0
- {pgsync-3.0.0 → pgsync-3.1.0}/tests/test_query_builder.py +0 -0
- {pgsync-3.0.0 → pgsync-3.1.0}/tests/test_redisqueue.py +0 -0
- {pgsync-3.0.0 → pgsync-3.1.0}/tests/test_transform.py +0 -0
- {pgsync-3.0.0 → pgsync-3.1.0}/tests/test_urls.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: pgsync
|
|
3
|
-
Version: 3.0.0
|
|
3
|
+
Version: 3.1.0
|
|
4
4
|
Summary: Postgres to Elasticsearch/OpenSearch sync
|
|
5
5
|
Home-page: https://github.com/toluaina/pgsync
|
|
6
6
|
Author: Tolu Aina
|
|
@@ -21,6 +21,7 @@ Classifier: Programming Language :: Python :: 3.8
|
|
|
21
21
|
Classifier: Programming Language :: Python :: 3.9
|
|
22
22
|
Classifier: Programming Language :: Python :: 3.10
|
|
23
23
|
Classifier: Programming Language :: Python :: 3.11
|
|
24
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
24
25
|
Classifier: Programming Language :: Python :: Implementation :: CPython
|
|
25
26
|
Classifier: Programming Language :: Python :: Implementation :: PyPy
|
|
26
27
|
Classifier: License :: OSI Approved :: MIT License
|
|
@@ -30,39 +31,33 @@ Description-Content-Type: text/markdown
|
|
|
30
31
|
License-File: LICENSE
|
|
31
32
|
License-File: AUTHORS.rst
|
|
32
33
|
Requires-Dist: async-timeout==4.0.3
|
|
33
|
-
Requires-Dist:
|
|
34
|
-
Requires-Dist:
|
|
35
|
-
Requires-Dist: botocore==1.32.3
|
|
34
|
+
Requires-Dist: boto3==1.34.11
|
|
35
|
+
Requires-Dist: botocore==1.34.11
|
|
36
36
|
Requires-Dist: certifi==2023.11.17
|
|
37
37
|
Requires-Dist: charset-normalizer==3.3.2
|
|
38
38
|
Requires-Dist: click==8.1.7
|
|
39
|
-
Requires-Dist:
|
|
40
|
-
Requires-Dist: elasticsearch
|
|
41
|
-
Requires-Dist:
|
|
42
|
-
Requires-Dist:
|
|
43
|
-
Requires-Dist: greenlet==3.0.
|
|
44
|
-
Requires-Dist: idna==3.
|
|
45
|
-
Requires-Dist: isort==5.12.0
|
|
39
|
+
Requires-Dist: elastic-transport==8.11.0
|
|
40
|
+
Requires-Dist: elasticsearch==8.11.1
|
|
41
|
+
Requires-Dist: elasticsearch-dsl==8.11.0
|
|
42
|
+
Requires-Dist: environs==10.0.0
|
|
43
|
+
Requires-Dist: greenlet==3.0.3
|
|
44
|
+
Requires-Dist: idna==3.6
|
|
46
45
|
Requires-Dist: jmespath==1.0.1
|
|
47
46
|
Requires-Dist: marshmallow==3.20.1
|
|
48
|
-
Requires-Dist: mypy-extensions==1.0.0
|
|
49
47
|
Requires-Dist: opensearch-dsl==2.1.0
|
|
50
|
-
Requires-Dist: opensearch-py==2.4.
|
|
48
|
+
Requires-Dist: opensearch-py==2.4.2
|
|
51
49
|
Requires-Dist: packaging==23.2
|
|
52
|
-
Requires-Dist: pathspec==0.11.2
|
|
53
|
-
Requires-Dist: platformdirs==4.0.0
|
|
54
50
|
Requires-Dist: psycopg2-binary==2.9.9
|
|
55
51
|
Requires-Dist: python-dateutil==2.8.2
|
|
56
52
|
Requires-Dist: python-dotenv==1.0.0
|
|
57
53
|
Requires-Dist: redis==5.0.1
|
|
58
54
|
Requires-Dist: requests==2.31.0
|
|
59
55
|
Requires-Dist: requests-aws4auth==1.2.3
|
|
60
|
-
Requires-Dist: s3transfer==0.
|
|
56
|
+
Requires-Dist: s3transfer==0.10.0
|
|
61
57
|
Requires-Dist: six==1.16.0
|
|
62
|
-
Requires-Dist: sqlalchemy==
|
|
58
|
+
Requires-Dist: sqlalchemy==2.0.25
|
|
63
59
|
Requires-Dist: sqlparse==0.4.4
|
|
64
|
-
Requires-Dist:
|
|
65
|
-
Requires-Dist: typing-extensions==4.8.0
|
|
60
|
+
Requires-Dist: typing-extensions==4.9.0
|
|
66
61
|
Requires-Dist: urllib3==1.26.18
|
|
67
62
|
|
|
68
63
|
# PostgreSQL to Elasticsearch/OpenSearch sync
|
|
@@ -56,9 +56,9 @@ def main(teardown, config, user, password, host, port, verbose):
|
|
|
56
56
|
|
|
57
57
|
validate: bool = False if teardown else True
|
|
58
58
|
|
|
59
|
-
for
|
|
59
|
+
for doc in config_loader(config):
|
|
60
60
|
sync: Sync = Sync(
|
|
61
|
-
|
|
61
|
+
doc,
|
|
62
62
|
verbose=verbose,
|
|
63
63
|
validate=validate,
|
|
64
64
|
repl_slots=False,
|
|
@@ -1,42 +1,42 @@
|
|
|
1
1
|
#!/usr/bin/env python
|
|
2
2
|
|
|
3
3
|
"""
|
|
4
|
-
Parallel sync is an innovative, experimental feature designed to optimize
|
|
5
|
-
throughput by utilizing available CPUs/threads, particularly beneficial
|
|
4
|
+
Parallel sync is an innovative, experimental feature designed to optimize
|
|
5
|
+
throughput by utilizing available CPUs/threads, particularly beneficial
|
|
6
6
|
in environments experiencing high network latency.
|
|
7
7
|
|
|
8
8
|
Scenario & Challenge:
|
|
9
|
-
In instances where your PG database, Elasticsearch/OpenSearch, and PGSync
|
|
10
|
-
servers operate on divergent networks, a delay in request/response time is
|
|
11
|
-
noticeable. The primary constraint emerges from the database query's roundtrip,
|
|
12
|
-
which even server-side cursors can address only to a limited extent by fetching
|
|
13
|
-
a certain number of records at a time. The consequent delay in fetching the
|
|
9
|
+
In instances where your PG database, Elasticsearch/OpenSearch, and PGSync
|
|
10
|
+
servers operate on divergent networks, a delay in request/response time is
|
|
11
|
+
noticeable. The primary constraint emerges from the database query's roundtrip,
|
|
12
|
+
which even server-side cursors can address only to a limited extent by fetching
|
|
13
|
+
a certain number of records at a time. The consequent delay in fetching the
|
|
14
14
|
next cursor significantly hampers the overall synchronization speed.
|
|
15
15
|
|
|
16
16
|
Solution:
|
|
17
|
-
To mitigate this, the strategy is to conduct an initial fast/parallel sync,
|
|
18
|
-
thereby populating Elasticsearch/OpenSearch in a single iteration.
|
|
17
|
+
To mitigate this, the strategy is to conduct an initial fast/parallel sync,
|
|
18
|
+
thereby populating Elasticsearch/OpenSearch in a single iteration.
|
|
19
19
|
Post this, the regular pgsync can continue running as a daemon.
|
|
20
20
|
|
|
21
21
|
Approach and Technical Implementation:
|
|
22
|
-
The approach centers around utilizing the Tuple identifier record of the table
|
|
23
|
-
columns. Every table incorporates a system column – "ctid" of type "tid,"
|
|
22
|
+
The approach centers around utilizing the Tuple identifier record of the table
|
|
23
|
+
columns. Every table incorporates a system column – "ctid" of type "tid,"
|
|
24
24
|
which helps identify the page record and the row number in each block.
|
|
25
25
|
This element facilitates the pagination of the sync process.
|
|
26
26
|
|
|
27
|
-
Technically, pagination implies dividing each paged record amongst the
|
|
28
|
-
available CPUs/threads. This division enables the parallel execution of
|
|
29
|
-
Elasticsearch/OpenSearch bulk inserts. The "ctid" serves as a tuple
|
|
27
|
+
Technically, pagination implies dividing each paged record amongst the
|
|
28
|
+
available CPUs/threads. This division enables the parallel execution of
|
|
29
|
+
Elasticsearch/OpenSearch bulk inserts. The "ctid" serves as a tuple
|
|
30
30
|
(for instance, (1, 5)), pinpointing the row in a disk page.
|
|
31
31
|
|
|
32
|
-
By leveraging this method, all paged row records are retrieved upfront and
|
|
33
|
-
allocated as work units across the worker threads/CPUs.
|
|
34
|
-
Each work unit, defined by the BLOCK_SIZE, denotes the number of root node
|
|
32
|
+
By leveraging this method, all paged row records are retrieved upfront and
|
|
33
|
+
allocated as work units across the worker threads/CPUs.
|
|
34
|
+
Each work unit, defined by the BLOCK_SIZE, denotes the number of root node
|
|
35
35
|
records assigned for each worker to process.
|
|
36
36
|
|
|
37
|
-
Subsequently, the workers execute queries for each assigned chunk of work,
|
|
38
|
-
filtered based on the page number and row numbers.
|
|
39
|
-
This systematic and parallel approach optimizes the synchronization process,
|
|
37
|
+
Subsequently, the workers execute queries for each assigned chunk of work,
|
|
38
|
+
filtered based on the page number and row numbers.
|
|
39
|
+
This systematic and parallel approach optimizes the synchronization process,
|
|
40
40
|
especially in environments challenged by network latency.
|
|
41
41
|
"""
|
|
42
42
|
|
|
@@ -45,56 +45,50 @@ import multiprocessing
|
|
|
45
45
|
import os
|
|
46
46
|
import re
|
|
47
47
|
import sys
|
|
48
|
+
import typing as t
|
|
48
49
|
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
|
|
49
50
|
from dataclasses import dataclass
|
|
50
51
|
from queue import Queue
|
|
51
52
|
from threading import Thread
|
|
52
|
-
from typing import Generator, Optional, Union
|
|
53
53
|
|
|
54
54
|
import click
|
|
55
55
|
import sqlalchemy as sa
|
|
56
56
|
|
|
57
57
|
from pgsync.settings import BLOCK_SIZE, CHECKPOINT_PATH
|
|
58
58
|
from pgsync.sync import Sync
|
|
59
|
-
from pgsync.utils import
|
|
60
|
-
compiled_query,
|
|
61
|
-
config_loader,
|
|
62
|
-
get_config,
|
|
63
|
-
show_settings,
|
|
64
|
-
timeit,
|
|
65
|
-
)
|
|
59
|
+
from pgsync.utils import config_loader, get_config, show_settings, timeit
|
|
66
60
|
|
|
67
61
|
|
|
68
|
-
def save_ctid(page: int, row: int,
|
|
62
|
+
def save_ctid(page: int, row: int, filename: str) -> None:
|
|
69
63
|
"""
|
|
70
64
|
Save the checkpoint for a given page and row in a file with the given name.
|
|
71
65
|
|
|
72
66
|
Args:
|
|
73
67
|
page (int): The page number to save.
|
|
74
68
|
row (int): The row number to save.
|
|
75
|
-
|
|
69
|
+
filename (str): The name of the file to save the checkpoint in.
|
|
76
70
|
"""
|
|
77
|
-
|
|
78
|
-
with open(
|
|
71
|
+
filepath: str = os.path.join(CHECKPOINT_PATH, f".{filename}.ctid")
|
|
72
|
+
with open(filepath, "w+") as fp:
|
|
79
73
|
fp.write(f"{page},{row}\n")
|
|
80
74
|
|
|
81
75
|
|
|
82
|
-
def read_ctid(
|
|
76
|
+
def read_ctid(filename: str) -> t.Tuple[t.Optional[int], t.Optional[int]]:
|
|
83
77
|
"""
|
|
84
78
|
Reads the checkpoint file for the given name and returns the page and row numbers.
|
|
85
79
|
|
|
86
80
|
Args:
|
|
87
|
-
|
|
81
|
+
filename (str): The name of the checkpoint file.
|
|
88
82
|
|
|
89
83
|
Returns:
|
|
90
84
|
tuple: A tuple containing the page and row numbers. If the checkpoint file does not exist, returns (None, None).
|
|
91
85
|
"""
|
|
92
|
-
|
|
93
|
-
if os.path.exists(
|
|
94
|
-
with open(
|
|
86
|
+
filepath: str = os.path.join(CHECKPOINT_PATH, f".{filename}.ctid")
|
|
87
|
+
if os.path.exists(filepath):
|
|
88
|
+
with open(filepath, "r") as fp:
|
|
95
89
|
pairs: str = fp.read().split()[0].split(",")
|
|
96
|
-
page = int(pairs[0])
|
|
97
|
-
row = int(pairs[1])
|
|
90
|
+
page: int = int(pairs[0])
|
|
91
|
+
row: int = int(pairs[1])
|
|
98
92
|
return page, row
|
|
99
93
|
return None, None
|
|
100
94
|
|
|
@@ -120,7 +114,6 @@ class Task:
|
|
|
120
114
|
sync: Sync = Sync(
|
|
121
115
|
self.doc, verbose=self.verbose, validate=self.validate
|
|
122
116
|
)
|
|
123
|
-
sync.tree.build(sync.nodes)
|
|
124
117
|
txmin: int = sync.checkpoint
|
|
125
118
|
txmax: int = sync.txid_current
|
|
126
119
|
sync.search_client.bulk(
|
|
@@ -134,19 +127,19 @@ class Task:
|
|
|
134
127
|
@timeit
|
|
135
128
|
def fetch_tasks(
|
|
136
129
|
doc: dict,
|
|
137
|
-
block_size: Optional[int] = None,
|
|
138
|
-
) -> Generator:
|
|
130
|
+
block_size: t.Optional[int] = None,
|
|
131
|
+
) -> t.Generator:
|
|
139
132
|
block_size = block_size or BLOCK_SIZE
|
|
140
133
|
pages: dict = {}
|
|
141
134
|
sync: Sync = Sync(doc)
|
|
142
|
-
page: Optional[int] = None
|
|
143
|
-
row: Optional[int] = None
|
|
144
|
-
|
|
135
|
+
page: t.Optional[int] = None
|
|
136
|
+
row: t.Optional[int] = None
|
|
137
|
+
filename: str = re.sub(
|
|
145
138
|
"[^0-9a-zA-Z_]+", "", f"{sync.database.lower()}_{sync.index}"
|
|
146
139
|
)
|
|
147
|
-
page, row = read_ctid(
|
|
140
|
+
page, row = read_ctid(filename)
|
|
148
141
|
statement: sa.sql.Select = sa.select(
|
|
149
|
-
[
|
|
142
|
+
*[
|
|
150
143
|
sa.literal_column("1").label("x"),
|
|
151
144
|
sa.literal_column("1").label("y"),
|
|
152
145
|
sa.column("ctid"),
|
|
@@ -213,11 +206,13 @@ def fetch_tasks(
|
|
|
213
206
|
|
|
214
207
|
@timeit
|
|
215
208
|
def synchronous(
|
|
216
|
-
tasks: Generator,
|
|
209
|
+
tasks: t.Generator,
|
|
210
|
+
doc: dict,
|
|
211
|
+
verbose: bool = False,
|
|
212
|
+
validate: bool = False,
|
|
217
213
|
) -> None:
|
|
218
214
|
sys.stdout.write("Synchronous\n")
|
|
219
215
|
sync: Sync = Sync(doc, verbose=verbose, validate=validate)
|
|
220
|
-
sync.tree.build(sync.nodes)
|
|
221
216
|
txmin: int = sync.checkpoint
|
|
222
217
|
txmax: int = sync.txid_current
|
|
223
218
|
index: str = sync.index
|
|
@@ -231,9 +226,9 @@ def synchronous(
|
|
|
231
226
|
|
|
232
227
|
@timeit
|
|
233
228
|
def multithreaded(
|
|
234
|
-
tasks: Generator,
|
|
229
|
+
tasks: t.Generator,
|
|
235
230
|
doc: dict,
|
|
236
|
-
|
|
231
|
+
nthreads: t.Optional[int] = None,
|
|
237
232
|
verbose: bool = False,
|
|
238
233
|
validate: bool = False,
|
|
239
234
|
) -> None:
|
|
@@ -250,12 +245,11 @@ def multithreaded(
|
|
|
250
245
|
)
|
|
251
246
|
queue.task_done()
|
|
252
247
|
|
|
253
|
-
|
|
248
|
+
nthreads: int = nthreads or 1
|
|
254
249
|
queue: Queue = Queue()
|
|
255
250
|
sync: Sync = Sync(doc, verbose=verbose, validate=validate)
|
|
256
|
-
sync.tree.build(sync.nodes)
|
|
257
251
|
|
|
258
|
-
for _ in range(
|
|
252
|
+
for _ in range(nthreads):
|
|
259
253
|
thread: Thread = Thread(
|
|
260
254
|
target=worker,
|
|
261
255
|
args=(
|
|
@@ -274,15 +268,15 @@ def multithreaded(
|
|
|
274
268
|
|
|
275
269
|
@timeit
|
|
276
270
|
def multiprocess(
|
|
277
|
-
tasks: Generator,
|
|
271
|
+
tasks: t.Generator,
|
|
278
272
|
doc: dict,
|
|
279
|
-
|
|
273
|
+
ncpus: t.Optional[int] = None,
|
|
280
274
|
verbose: bool = False,
|
|
281
275
|
validate: bool = False,
|
|
282
276
|
) -> None:
|
|
283
277
|
sys.stdout.write("Multiprocess\n")
|
|
284
278
|
task: Task = Task(doc, verbose=verbose, validate=validate)
|
|
285
|
-
with ProcessPoolExecutor(max_workers=
|
|
279
|
+
with ProcessPoolExecutor(max_workers=ncpus) as executor:
|
|
286
280
|
try:
|
|
287
281
|
list(executor.map(task.process, tasks))
|
|
288
282
|
except Exception as e:
|
|
@@ -292,14 +286,14 @@ def multiprocess(
|
|
|
292
286
|
|
|
293
287
|
@timeit
|
|
294
288
|
def multithreaded_async(
|
|
295
|
-
tasks: Generator,
|
|
289
|
+
tasks: t.Generator,
|
|
296
290
|
doc: dict,
|
|
297
|
-
|
|
291
|
+
nthreads: t.Optional[int] = None,
|
|
298
292
|
verbose: bool = False,
|
|
299
293
|
validate: bool = False,
|
|
300
294
|
) -> None:
|
|
301
295
|
sys.stdout.write("Multi-threaded async\n")
|
|
302
|
-
executor: ThreadPoolExecutor = ThreadPoolExecutor(max_workers=
|
|
296
|
+
executor: ThreadPoolExecutor = ThreadPoolExecutor(max_workers=nthreads)
|
|
303
297
|
event_loop = asyncio.get_event_loop()
|
|
304
298
|
event_loop.run_until_complete(
|
|
305
299
|
run_tasks(executor, tasks, doc, verbose=verbose, validate=validate)
|
|
@@ -309,14 +303,14 @@ def multithreaded_async(
|
|
|
309
303
|
|
|
310
304
|
@timeit
|
|
311
305
|
def multiprocess_async(
|
|
312
|
-
tasks: Generator,
|
|
306
|
+
tasks: t.Generator,
|
|
313
307
|
doc: dict,
|
|
314
|
-
|
|
308
|
+
ncpus: t.Optional[int] = None,
|
|
315
309
|
verbose: bool = False,
|
|
316
310
|
validate: bool = False,
|
|
317
311
|
) -> None:
|
|
318
312
|
sys.stdout.write("Multi-process async\n")
|
|
319
|
-
executor: ProcessPoolExecutor = ProcessPoolExecutor(max_workers=
|
|
313
|
+
executor: ProcessPoolExecutor = ProcessPoolExecutor(max_workers=ncpus)
|
|
320
314
|
event_loop = asyncio.get_event_loop()
|
|
321
315
|
try:
|
|
322
316
|
event_loop.run_until_complete(
|
|
@@ -328,18 +322,18 @@ def multiprocess_async(
|
|
|
328
322
|
|
|
329
323
|
|
|
330
324
|
async def run_tasks(
|
|
331
|
-
executor: Union[ThreadPoolExecutor, ProcessPoolExecutor],
|
|
332
|
-
tasks: Generator,
|
|
325
|
+
executor: t.Union[ThreadPoolExecutor, ProcessPoolExecutor],
|
|
326
|
+
tasks: t.Generator,
|
|
333
327
|
doc: dict,
|
|
334
328
|
verbose: bool = False,
|
|
335
329
|
validate: bool = False,
|
|
336
330
|
) -> None:
|
|
337
|
-
sync: Optional[Sync] = None
|
|
331
|
+
sync: t.Optional[Sync] = None
|
|
338
332
|
if isinstance(executor, ThreadPoolExecutor):
|
|
339
333
|
# threads can share a common Sync object
|
|
340
334
|
sync = Sync(doc, verbose=verbose, validate=validate)
|
|
341
335
|
event_loop = asyncio.get_event_loop()
|
|
342
|
-
completed,
|
|
336
|
+
completed, _ = await asyncio.wait(
|
|
343
337
|
[
|
|
344
338
|
event_loop.run_in_executor(
|
|
345
339
|
executor, run_task, task, sync, doc, verbose, validate
|
|
@@ -354,14 +348,13 @@ async def run_tasks(
|
|
|
354
348
|
|
|
355
349
|
def run_task(
|
|
356
350
|
task: dict,
|
|
357
|
-
sync: Optional[Sync] = None,
|
|
358
|
-
doc: Optional[dict] = None,
|
|
351
|
+
sync: t.Optional[Sync] = None,
|
|
352
|
+
doc: t.Optional[dict] = None,
|
|
359
353
|
verbose: bool = False,
|
|
360
354
|
validate: bool = False,
|
|
361
355
|
) -> int:
|
|
362
356
|
if sync is None:
|
|
363
357
|
sync: Sync = Sync(doc, verbose=verbose, validate=validate)
|
|
364
|
-
sync.tree.build(sync.nodes)
|
|
365
358
|
txmin: int = sync.checkpoint
|
|
366
359
|
txmax: int = sync.txid_current
|
|
367
360
|
sync.search_client.bulk(
|
|
@@ -371,10 +364,10 @@ def run_task(
|
|
|
371
364
|
if len(task) > 0:
|
|
372
365
|
page: int = max(task.keys())
|
|
373
366
|
row: int = max(task[page])
|
|
374
|
-
|
|
367
|
+
filename: str = re.sub(
|
|
375
368
|
"[^0-9a-zA-Z_]+", "", f"{sync.database.lower()}_{sync.index}"
|
|
376
369
|
)
|
|
377
|
-
save_ctid(page
|
|
370
|
+
save_ctid(page, row, filename)
|
|
378
371
|
|
|
379
372
|
return 1
|
|
380
373
|
|
|
@@ -426,20 +419,18 @@ def main(config, nprocs, mode, verbose):
|
|
|
426
419
|
show_settings()
|
|
427
420
|
config: str = get_config(config)
|
|
428
421
|
|
|
429
|
-
for
|
|
430
|
-
tasks: Generator = fetch_tasks(
|
|
422
|
+
for doc in config_loader(config):
|
|
423
|
+
tasks: t.Generator = fetch_tasks(doc)
|
|
431
424
|
if mode == "synchronous":
|
|
432
|
-
synchronous(tasks,
|
|
425
|
+
synchronous(tasks, doc, verbose=verbose)
|
|
433
426
|
elif mode == "multithreaded":
|
|
434
|
-
multithreaded(tasks,
|
|
427
|
+
multithreaded(tasks, doc, nthreads=nprocs, verbose=verbose)
|
|
435
428
|
elif mode == "multiprocess":
|
|
436
|
-
multiprocess(tasks,
|
|
429
|
+
multiprocess(tasks, doc, ncpus=nprocs, verbose=verbose)
|
|
437
430
|
elif mode == "multithreaded_async":
|
|
438
|
-
multithreaded_async(
|
|
439
|
-
tasks, document, nprocs=nprocs, verbose=verbose
|
|
440
|
-
)
|
|
431
|
+
multithreaded_async(tasks, doc, nthreads=nprocs, verbose=verbose)
|
|
441
432
|
elif mode == "multiprocess_async":
|
|
442
|
-
multiprocess_async(tasks,
|
|
433
|
+
multiprocess_async(tasks, doc, ncpus=nprocs, verbose=verbose)
|
|
443
434
|
|
|
444
435
|
|
|
445
436
|
if __name__ == "__main__":
|