linkarchivetools 0.1.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- linkarchivetools/LICENSE +674 -0
- linkarchivetools/README.md +3 -0
- linkarchivetools/__init__.py +8 -0
- linkarchivetools/backup.py +764 -0
- linkarchivetools/db2feeds.py +263 -0
- linkarchivetools/db2json.py +188 -0
- linkarchivetools/dbanalyzer.py +356 -0
- linkarchivetools/dbfilter.py +154 -0
- linkarchivetools/dbmerge.py +82 -0
- linkarchivetools/json2db.py +237 -0
- linkarchivetools/tableconfig.py +66 -0
- linkarchivetools/utils/alchemysearch.py +177 -0
- linkarchivetools/utils/omnisearch.py +335 -0
- linkarchivetools/utils/reflected.py +501 -0
- linkarchivetools-0.1.10.dist-info/LICENSE +674 -0
- linkarchivetools-0.1.10.dist-info/METADATA +38 -0
- linkarchivetools-0.1.10.dist-info/RECORD +18 -0
- linkarchivetools-0.1.10.dist-info/WHEEL +4 -0
|
@@ -0,0 +1,237 @@
|
|
|
1
|
+
"""
|
|
2
|
+
@brief Converts JSON files to SQLite DB
|
|
3
|
+
|
|
4
|
+
SQLite can easily be imported and used by other projects.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import os
|
|
8
|
+
import sqlite3
|
|
9
|
+
import json
|
|
10
|
+
import argparse
|
|
11
|
+
import time
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
from sqlalchemy import create_engine
|
|
14
|
+
from dateutil import parser
|
|
15
|
+
|
|
16
|
+
from .utils.reflected import *
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class DirReader(object):
    """Recursively collects files with accepted extensions from a directory.

    @param source_files_directory  Root directory to scan.
    @param accepted_extensions     Extensions (including the dot) to accept;
                                   defaults to [".json"].
    """

    def __init__(self, source_files_directory, accepted_extensions=None):
        self.dir = source_files_directory
        # Bug fix: the original only assigned self.accepted_extensions when
        # the argument was None, so passing a custom list left the attribute
        # unset and get_files() raised AttributeError.
        if accepted_extensions is None:
            accepted_extensions = [".json"]
        self.accepted_extensions = accepted_extensions

    def get_files(self):
        """Return a sorted list of matching file paths under self.dir."""
        file_list = []
        for root, dirs, files in os.walk(self.dir):
            for file in files:
                # splitext()[1] is the extension including the leading dot.
                if os.path.splitext(file)[1] in self.accepted_extensions:
                    file_list.append(os.path.join(root, file))
        return sorted(file_list)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
class JSON2Db(object):
    """
    Performs the actual conversion from JSON to a SQLite DB.

    Entries are read from a single JSON file, or from every JSON file found
    in a directory, and inserted through ReflectedEntryTable.

    @param input_file      single JSON file to convert (used when input_dir
                           is not given)
    @param input_dir       directory scanned recursively for *.json files
    @param output_db       path of the SQLite database to write
    @param preserve_id     keep the "id" field of JSON entries; entries
                           without an id are skipped
    @param vote_threshold  minimum "page_rating_votes" an entry needs
    @param verbose         print a line for every processed entry
    """

    def __init__(self, input_file=None, input_dir=None, output_db=None, preserve_id=False, vote_threshold=None, verbose=False):
        self.input_file = input_file
        self.input_dir = input_dir
        self.output_db = output_db
        self.preserve_id = preserve_id
        self.vote_threshold = vote_threshold
        self.verbose = verbose

        # A directory takes precedence over a single input file.
        if self.input_dir:
            self.file_reader = DirReader(source_files_directory=self.input_dir)
            self.files = self.file_reader.get_files()
        elif self.input_file:
            self.file_reader = None
            self.files = [self.input_file]
        else:
            self.file_reader = None
            self.files = []

    def convert(self):
        """Convert all collected files into the output database."""
        self.engine = create_engine(f"sqlite:///{self.output_db}")
        with self.engine.connect() as connection:
            self.connection = connection

            total_num_files = len(self.files)

            for row, afile in enumerate(self.files):
                print("[{}/{}]: file:{}".format(row, total_num_files, afile))
                self.convert_file(afile)

    def convert_file(self, file_name):
        """Convert a single JSON file; unreadable files are skipped."""
        data = self.read_file(file_name)
        if not data:
            return

        total_rows = len(data)

        for row, entry in enumerate(data):
            entry = self.prepare_entry(entry)
            if "link" in entry:
                if self.preserve_id:
                    # When preserving IDs an entry without one cannot be
                    # inserted unambiguously - skip it.
                    if "id" not in entry:
                        print("Entry {} is missing ID".format(entry["link"]))
                        continue
                else:
                    # Synthesize a per-file sequential ID.
                    entry["id"] = row

            if self.is_entry_to_be_added(entry):
                table = ReflectedEntryTable(engine=self.engine, connection=self.connection)
                if table.insert_json(entry) is not None:
                    if self.verbose:
                        print(
                            " -> [{}/{}] Link:{} Added".format(
                                row, total_rows, entry["link"]
                            )
                        )
                else:
                    print(
                        " -> [{}/{}] Link:{} NOT Added".format(
                            row, total_rows, entry["link"]
                        )
                    )
            else:
                if self.verbose:
                    print(
                        " -> [{}/{}] Link:{} Skipped".format(
                            row, total_rows, entry["link"]
                        )
                    )

    def prepare_entry(self, entry):
        """
        Drops keys that do not match a destination column, and parses
        date* fields into datetime objects.
        """
        table = ReflectedEntryTable(engine=self.engine, connection=self.connection)
        columns = table.get_column_names()
        keys = list(entry.keys())

        # Remove keys the destination table has no column for.
        for item in set(keys) - set(columns):
            del entry[item]

        for key in entry:
            if key.startswith("date"):
                if entry[key]:
                    entry[key] = parser.parse(entry[key])

        return entry

    def is_entry_to_be_added(self, entry):
        """Return True when the entry is new and passes the vote filter."""
        table = ReflectedEntryTable(engine=self.engine, connection=self.connection)
        # Never insert duplicates (matched by id or by link).
        if "id" in entry and table.exists(id=entry["id"]):
            return False
        if "link" in entry and table.exists(link=entry["link"]):
            return False

        if self.vote_threshold:
            # With a threshold configured an entry must carry a non-empty
            # vote count and meet the minimum.
            if "page_rating_votes" in entry and entry["page_rating_votes"]:
                return int(entry["page_rating_votes"]) >= self.vote_threshold
            return False

        return True

    def read_file_contents(self, file_name):
        """Return the raw text of file_name."""
        with open(file_name, "r") as f:
            return f.read()

    def read_file(self, file_name):
        """Parse file_name as JSON; return the entry list, or None on error."""
        text = self.read_file_contents(file_name)

        try:
            j = json.loads(text)

            # Exports wrap the entry list under "links" or "sources".
            if "links" in j:
                return j["links"]
            if "sources" in j:
                return j["sources"]

            return j
        except Exception as e:
            # Bug fix: the original referenced the undefined name "afile"
            # here, masking the real parse error with a NameError.
            print("Could not read file: {}".format(file_name))
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
class Parser(object):
    """Command-line argument parser for the converter program."""

    def parse(self, args=None):
        """Parse command-line arguments.

        @param args  Optional list of argument strings; defaults to
                     sys.argv[1:] (standard argparse behavior). Added for
                     testability; omitting it preserves the old behavior.
        """
        self.parser = argparse.ArgumentParser(description="Data converter program")
        self.parser.add_argument("--input-file", help="File to be scanned")
        self.parser.add_argument("--input-dir", help="Directory to be scanned")
        self.parser.add_argument(
            "--output-db", default="converted.sqlite", help="Output db name"
        )
        self.parser.add_argument(
            "--preserve-id", action="store_true", help="Preserves ID of objects"
        )
        self.parser.add_argument("--vote-min", help="Minimum amount of entry vote")
        self.parser.add_argument("--language", help="Accept language")  # TODO implement
        self.parser.add_argument("--entries", help="Convert entries")  # TODO implement
        self.parser.add_argument("--sources", help="Convert sources")  # TODO implement
        self.parser.add_argument(
            "--verbose", action="store_true", help="Shows more info"
        )

        self.args = self.parser.parse_args(args)

        # Bug fix: argparse stores "--input-dir" as attribute "input_dir";
        # the original read the nonexistent "self.args.dir" and raised
        # AttributeError on every run.
        self.dir = self.args.input_dir if self.args.input_dir else None

        # store_true flags default to False; normalize falsy to None as
        # the original did.
        self.preserve_id = self.args.preserve_id if self.args.preserve_id else None

        self.vote_min = int(self.args.vote_min) if self.args.vote_min else None
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
def main():
    """Entry point: parse arguments, run the conversion, report timing."""
    print("Starting processing")
    parser = Parser()
    parser.parse()

    try:
        start_time = time.time()

        # Forward all parsed options; the original dropped preserve_id,
        # vote_threshold and verbose even though Parser collected them.
        c = JSON2Db(
            input_file=parser.args.input_file,
            input_dir=parser.args.input_dir,
            output_db=parser.args.output_db,
            preserve_id=parser.preserve_id,
            vote_threshold=parser.vote_min,
            verbose=parser.args.verbose,
        )
        c.convert()

        elapsed_time_seconds = time.time() - start_time
        elapsed_minutes = int(elapsed_time_seconds // 60)
        elapsed_seconds = int(elapsed_time_seconds % 60)
        print(f"Time: {elapsed_minutes}:{elapsed_seconds}")

    except Exception as e:
        print("Exception: {}".format(e))
    except KeyboardInterrupt as e:
        print("Exception: {}".format(e))

    # Bug fix: the original called db.close() on an undefined name, raising
    # NameError on every run. No cleanup is needed here: the database
    # connection is managed by a context manager inside JSON2Db.convert().
    print("Processing DONE")
|
|
234
|
+
|
|
235
|
+
|
|
236
|
+
# Script entry point: run the converter only when executed directly,
# not when imported as a module.
if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
def get_tables():
    """Return the names of all application tables.

    Bug fix: the original ended with ``return tables`` while the list was
    bound to ``table_names``, so every call raised NameError.
    """
    table_names = [
        "credentials",
        "sourcecategories",
        "sourcesubcategories",
        "sourcedatamodel",
        "userconfig",
        "configurationentry",
        "linkdatamodel",
        "domains",
        "usertags",
        "compactedtags",
        "usercompactedtags",
        "entrycompactedtags",
        "uservotes",
        "browser",
        "entryrules",
        "dataexport",
        "gateway",
        "modelfiles",
        "readlater",
        "searchview",
        "socialdata",
        "blockentry",
        "blockentrylist",
        "usercomments",
        "userbookmarks",
        "usersearchhistory",
        "userentrytransitionhistory",
        "userentryvisithistory",
    ]
    return table_names
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def get_backup_tables():
    """Return the table names that should be included in a backup.

    The "blockentry" table is deliberately excluded: its contents have to
    be reinitialized each time, so backing it up is pointless.
    """
    backup_table_names = (
        "credentials",
        "sourcecategories",
        "sourcesubcategories",
        "sourcedatamodel",
        "userconfig",
        "configurationentry",
        "linkdatamodel",
        "domains",
        "usertags",
        "compactedtags",
        "usercompactedtags",
        "entrycompactedtags",
        "uservotes",
        "browser",
        "entryrules",
        "dataexport",
        "gateway",
        "modelfiles",
        "readlater",
        "searchview",
        "socialdata",
        # "blockentry" intentionally omitted - reinitialized on restore
        "blockentrylist",
        "usercomments",
        "userbookmarks",
        "usersearchhistory",
        "userentrytransitionhistory",
        "userentryvisithistory",
    )
    return list(backup_table_names)
|
|
@@ -0,0 +1,177 @@
|
|
|
1
|
+
from sqlalchemy import and_, or_, not_, func, MetaData, Table, select
|
|
2
|
+
|
|
3
|
+
from .omnisearch import (
|
|
4
|
+
SingleSymbolEvaluator,
|
|
5
|
+
EquationEvaluator,
|
|
6
|
+
OmniSearch,
|
|
7
|
+
)
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class AlchemySymbolEvaluator(SingleSymbolEvaluator):
    """Translates single search conditions into SQLAlchemy column
    expressions that are truthy for matching rows.

    @param table        reflected SQLAlchemy Table to search
    @param ignore_case  perform case-insensitive comparisons
    """

    # Operators understood by evaluate_complex_symbol.
    _SUPPORTED_OPERATORS = ("==", "!=", ">", "<", ">=", "<=", "=")

    def __init__(self, table, ignore_case=False):
        self.table = table
        self.ignore_case = ignore_case

    def evaluate_complex_symbol(self, symbol, condition_data):
        # condition_data is (column name, operator, value).
        # TODO make todo check if symbol exists in table?
        column_name, operator, value = (
            condition_data[0],
            condition_data[1],
            condition_data[2],
        )

        if operator not in self._SUPPORTED_OPERATORS:
            raise IOError("Unsupported operator")

        column = self.table.c[column_name]

        if operator == "==":
            if self.ignore_case:
                return func.lower(column) == value.lower()
            return column == value

        if operator == "!=":
            if self.ignore_case:
                return func.lower(column) != value.lower()
            return column != value

        if operator == ">":
            return column > value
        if operator == "<":
            return column < value
        if operator == ">=":
            return column >= value
        if operator == "<=":
            return column <= value

        # "=" is a wildcard match; translate glob-style "*" to SQL "%".
        pattern = value.replace("*", "%")
        if self.ignore_case:
            return column.ilike(pattern)
        return column.like(pattern)

    def evaluate_simple_symbol(self, symbol):
        """
        TODO we could check by default if entry link == symbol, or sth
        """
        # A bare symbol matches link, title or description.
        pattern = symbol.replace("*", "%")
        if self.ignore_case:
            matcher = lambda col: col.ilike(pattern)
        else:
            matcher = lambda col: col.like(pattern)
        return or_(
            matcher(self.table.c["link"]),
            matcher(self.table.c["title"]),
            matcher(self.table.c["description"]),
        )
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
class AlchemyEquationEvaluator(EquationEvaluator):
    """Combines evaluated sub-conditions with SQLAlchemy boolean operators."""

    def evaluate_function(self, operation_symbol, function, args0, args1):
        # Map parser function names onto SQLAlchemy conjunctions.
        if function == "Not":  # ~ sign; unary, args1 is ignored
            return not_(args0)
        if function == "And":  # & sign
            return and_(args0, args1)
        if function == "Or":  # | sign
            return or_(args0, args1)
        raise NotImplementedError("Not implemented function: {}".format(function))
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
class AlchemyRowHandler(object):
    """Base class for row handlers fed by AlchemySearch.

    Subclasses override handle_row() to process each result row; the
    default implementation deliberately does nothing.
    """

    def handle_row(self, row):
        # No-op by default; override in subclasses.
        return None
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
class AlchemySearch(object):
    """Runs an OmniSearch query against a reflected database table.

    @param db           SQLAlchemy engine used for table reflection
    @param search_term  search expression understood by OmniSearch
    @param row_handler  AlchemyRowHandler invoked for every matching row
    @param args         optional argparse namespace (table, ignore_case,
                        order_by, asc, desc)
    @param connection   open SQLAlchemy connection used to execute queries
    """

    def __init__(self, db, search_term, row_handler=None, args=None, connection=None):
        self.db = db
        self.connection = connection
        self.search_term = search_term
        self.alchemy_row_handler = row_handler

        self.args = args

        self.get_destination_table()

    def search(self):
        """Yield matching rows, passing each through the row handler first."""
        rows = self.get_filtered_objects()

        for row in rows:
            self.alchemy_row_handler.handle_row(row)
            yield row

    def get_destination_table(self):
        """Reflect the table to search; defaults to "linkdatamodel"."""
        destination_metadata = MetaData()

        table_name = "linkdatamodel"
        if self.args and self.args.table:
            table_name = self.args.table

        self.destination_table = Table(
            table_name, destination_metadata, autoload_with=self.db
        )

    def get_query_conditions(self):
        """Translate the search term into a combined SQLAlchemy condition."""
        ignore_case = bool(self.args and self.args.ignore_case)

        symbol_evaluator = AlchemySymbolEvaluator(self.destination_table, ignore_case)
        equation_evaluator = AlchemyEquationEvaluator(
            self.search_term, symbol_evaluator
        )

        search = OmniSearch(self.search_term, equation_evaluator=equation_evaluator)
        return search.get_combined_query()

    def get_filtered_objects(self):
        """Execute the search and return all matching rows.

        Raises AttributeError when the requested order-by column does not
        exist on the destination table.
        """
        combined_query_conditions = self.get_query_conditions()

        order_by_column_name = "id"
        if self.args and self.args.order_by:
            order_by_column_name = self.args.order_by

        order_by_column = getattr(self.destination_table.c, order_by_column_name, None)

        if order_by_column is None:
            # Bug fix: report the resolved column name. The original read
            # self.args.order_by here, which itself raised AttributeError
            # whenever self.args was None (default column "id" missing).
            raise AttributeError(f"Invalid order_by column: {order_by_column_name}")

        # Descending only when explicitly requested (and not overridden by
        # --asc); ascending is the default, also when args is None.
        if self.args and self.args.desc and not self.args.asc:
            order_by_clause = order_by_column.desc()
        else:
            order_by_clause = order_by_column.asc()

        # Use select() for SQLAlchemy Core.
        stmt = (
            select(self.destination_table)
            .where(combined_query_conditions)
            .order_by(order_by_clause)
        )

        result = self.connection.execute(stmt)

        return result.fetchall()
|