linkarchivetools 0.1.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,237 @@
1
+ """
2
+ @brief Converts JSON files to SQLite DB
3
+
4
+ SQLite can easily be imported and used by other projects.
5
+ """
6
+
7
+ import os
8
+ import sqlite3
9
+ import json
10
+ import argparse
11
+ import time
12
+ from pathlib import Path
13
+ from sqlalchemy import create_engine
14
+ from dateutil import parser
15
+
16
+ from .utils.reflected import *
17
+
18
+
19
class DirReader(object):
    """
    Walks a directory tree and collects files matching accepted extensions.
    """

    def __init__(self, source_files_directory, accepted_extensions=None):
        """
        @param source_files_directory Root directory to scan recursively
        @param accepted_extensions List of extensions including the dot
               (e.g. [".json"]); defaults to [".json"]
        """
        self.dir = source_files_directory
        # Bug fix: previously the attribute was only assigned when the argument
        # was None, so passing a custom extension list made get_files() crash
        # with AttributeError.
        if accepted_extensions is None:
            accepted_extensions = [".json"]
        self.accepted_extensions = accepted_extensions

    def get_files(self):
        """
        Returns a sorted list of paths of matching files found under self.dir.
        """
        file_list = []
        for root, dirs, files in os.walk(self.dir):
            for file in files:
                # splitext()[1] is the extension including the leading dot
                if os.path.splitext(file)[1] in self.accepted_extensions:
                    file_list.append(os.path.join(root, file))

        return sorted(file_list)
35
+
36
+
37
class JSON2Db(object):
    """
    Performs the actual conversion from JSON to a SQLite DB.

    Entries are read either from a single input file or from every JSON file
    found under an input directory, then inserted via ReflectedEntryTable.
    """

    def __init__(self, input_file=None, input_dir=None, output_db=None, preserve_id=False, vote_threshold=None, verbose=False):
        """
        @param input_file Single JSON file to convert (used when input_dir is not set)
        @param input_dir Directory scanned recursively for JSON files
        @param output_db Path of the SQLite database to write
        @param preserve_id Keep the "id" field from the JSON entries instead of the row index
        @param vote_threshold Skip entries whose page_rating_votes is below this value
        @param verbose Print a line for every added/skipped entry
        """
        self.input_file = input_file
        self.input_dir = input_dir
        self.output_db = output_db
        self.preserve_id = preserve_id
        self.vote_threshold = vote_threshold
        self.verbose = verbose

        if self.input_dir:
            self.file_reader = DirReader(source_files_directory=self.input_dir)
            self.files = self.file_reader.get_files()
        elif self.input_file:
            self.file_reader = None
            self.files = [self.input_file]
        else:
            self.file_reader = None
            self.files = []

    def convert(self):
        """
        Converts every collected input file into the output database.
        """
        self.engine = create_engine(f"sqlite:///{self.output_db}")
        with self.engine.connect() as connection:
            self.connection = connection

            total_num_files = len(self.files)

            for row, afile in enumerate(self.files):
                print("[{}/{}]: file:{}".format(row, total_num_files, afile))
                self.convert_file(afile)

    def convert_file(self, file_name):
        """
        Inserts all acceptable entries of one JSON file into the database.
        """
        data = self.read_file(file_name)
        if not data:
            return

        total_rows = len(data)

        for row, entry in enumerate(data):
            entry = self.prepare_entry(entry)
            if "link" in entry:
                if self.preserve_id:
                    if "id" not in entry:
                        print("Entry {} is missing ID".format(entry["link"]))
                        continue
                else:
                    # NOTE(review): the row index restarts for every file, so
                    # generated ids can collide across files; the existing-id
                    # check in is_entry_to_be_added() then skips those entries
                    # — confirm this is intended.
                    entry["id"] = row

                if self.is_entry_to_be_added(entry):
                    table = ReflectedEntryTable(engine=self.engine, connection=self.connection)
                    if table.insert_json(entry) is not None:
                        if self.verbose:
                            print(
                                " -> [{}/{}] Link:{} Added".format(
                                    row, total_rows, entry["link"]
                                )
                            )
                    else:
                        print(
                            " -> [{}/{}] Link:{} NOT Added".format(
                                row, total_rows, entry["link"]
                            )
                        )
                else:
                    if self.verbose:
                        print(
                            " -> [{}/{}] Link:{} Skipped".format(
                                row, total_rows, entry["link"]
                            )
                        )

    def prepare_entry(self, entry):
        """
        Drops any keys that have no matching table column, and parses
        date-like string values ("date*" keys) into datetime objects.
        """
        table = ReflectedEntryTable(engine=self.engine, connection=self.connection)
        columns = table.get_column_names()
        keys = list(entry.keys())

        diff = list(set(keys) - set(columns))
        for item in diff:
            del entry[item]

        for key in entry:
            if key.startswith("date"):
                if entry[key]:
                    entry[key] = parser.parse(entry[key])

        return entry

    def is_entry_to_be_added(self, entry):
        """
        Returns True when the entry is not already present and passes the
        vote threshold (if one is configured).
        """
        table = ReflectedEntryTable(engine=self.engine, connection=self.connection)
        # Entry already exists — by id or by link.
        if "id" in entry and table.exists(id=entry["id"]):
            return False
        if "link" in entry and table.exists(link=entry["link"]):
            return False

        if self.vote_threshold:
            # With a threshold set, entries lacking a usable vote count are rejected.
            if "page_rating_votes" in entry:
                if entry["page_rating_votes"]:
                    if int(entry["page_rating_votes"]) < self.vote_threshold:
                        return False
                    else:
                        return True
                return False
            return False

        return True

    def read_file_contents(self, file_name):
        """
        Returns the raw text content of file_name.
        """
        with open(file_name, "r") as f:
            return f.read()

    def read_file(self, file_name):
        """
        Parses file_name as JSON.

        Returns the "links" or "sources" list when present, the parsed object
        otherwise, or None when the file cannot be parsed.
        """
        text = self.read_file_contents(file_name)

        try:
            j = json.loads(text)

            if "links" in j:
                return j["links"]
            if "sources" in j:
                return j["sources"]

            return j
        except Exception as e:
            # Bug fix: previously referenced the undefined name "afile",
            # which raised NameError inside this error handler.
            print("Could not read file: {}".format(file_name))
172
+
173
+
174
class Parser(object):
    """
    Command line argument parser for the converter program.
    """

    def parse(self):
        """
        Parses sys.argv and exposes: args, dir, preserve_id, vote_min.
        """
        self.parser = argparse.ArgumentParser(description="Data converter program")
        self.parser.add_argument("--input-file", help="File to be scanned")
        self.parser.add_argument("--input-dir", help="Directory to be scanned")
        self.parser.add_argument(
            "--output-db", default="converted.sqlite", help="Output db name"
        )
        self.parser.add_argument(
            "--preserve-id", action="store_true", help="Preserves ID of objects"
        )
        self.parser.add_argument("--vote-min", help="Minimum amount of entry vote")
        self.parser.add_argument("--language", help="Accept language")  # TODO implement
        self.parser.add_argument("--entries", help="Convert entries")  # TODO implement
        self.parser.add_argument("--sources", help="Convert sources")  # TODO implement
        self.parser.add_argument(
            "--verbose", action="store_true", help="Shows more info"
        )

        self.args = self.parser.parse_args()

        # Bug fix: argparse stores "--input-dir" as args.input_dir; there is
        # no args.dir, so the original code raised AttributeError on every run.
        self.dir = self.args.input_dir if self.args.input_dir else None

        # store_true yields False when the flag is absent; keep the historical
        # behavior of exposing None in that case.
        self.preserve_id = self.args.preserve_id if self.args.preserve_id else None

        self.vote_min = int(self.args.vote_min) if self.args.vote_min else None
209
+
210
+
211
def main():
    """
    Entry point: parses arguments, runs the conversion and reports elapsed time.
    """
    print("Starting processing")
    parser = Parser()
    parser.parse()

    try:
        start_time = time.time()

        # Bug fix: preserve_id, vote_threshold and verbose were parsed but
        # never forwarded to the converter.
        c = JSON2Db(
            input_file=parser.args.input_file,
            input_dir=parser.args.input_dir,
            output_db=parser.args.output_db,
            preserve_id=parser.args.preserve_id,
            vote_threshold=parser.vote_min,
            verbose=parser.args.verbose,
        )
        c.convert()

        elapsed_time_seconds = time.time() - start_time
        elapsed_minutes = int(elapsed_time_seconds // 60)
        elapsed_seconds = int(elapsed_time_seconds % 60)
        # Zero-pad seconds so e.g. 65s prints as 1:05, not 1:5.
        print(f"Time: {elapsed_minutes}:{elapsed_seconds:02d}")

    except KeyboardInterrupt as e:
        print("Exception: {}".format(e))
    except Exception as e:
        print("Exception: {}".format(e))

    # Bug fix: removed db.close() — "db" was never defined and raised
    # NameError here after every run.
    print("Processing DONE")
234
+
235
+
236
# Allow running this module directly as a script.
if __name__ == "__main__":
    main()
@@ -0,0 +1,66 @@
1
def get_tables():
    """
    Returns the list of all table names used by the application.
    """
    # Bug fix: the list was assigned to "table_names" but the function
    # returned the undefined name "tables", raising NameError on every call.
    tables = [
        "credentials",
        "sourcecategories",
        "sourcesubcategories",
        "sourcedatamodel",
        "userconfig",
        "configurationentry",
        "linkdatamodel",
        "domains",
        "usertags",
        "compactedtags",
        "usercompactedtags",
        "entrycompactedtags",
        "uservotes",
        "browser",
        "entryrules",
        "dataexport",
        "gateway",
        "modelfiles",
        "readlater",
        "searchview",
        "socialdata",
        "blockentry",
        "blockentrylist",
        "usercomments",
        "userbookmarks",
        "usersearchhistory",
        "userentrytransitionhistory",
        "userentryvisithistory",
    ]
    return tables
33
+
34
+
35
def get_backup_tables():
    """
    Returns the table names that should be included in a backup.
    """
    # "blockentry" is intentionally absent: the block list has to be
    # reinitialized each time, so backing it up would be pointless.
    return [
        "credentials",
        "sourcecategories",
        "sourcesubcategories",
        "sourcedatamodel",
        "userconfig",
        "configurationentry",
        "linkdatamodel",
        "domains",
        "usertags",
        "compactedtags",
        "usercompactedtags",
        "entrycompactedtags",
        "uservotes",
        "browser",
        "entryrules",
        "dataexport",
        "gateway",
        "modelfiles",
        "readlater",
        "searchview",
        "socialdata",
        "blockentrylist",
        "usercomments",
        "userbookmarks",
        "usersearchhistory",
        "userentrytransitionhistory",
        "userentryvisithistory",
    ]
@@ -0,0 +1,177 @@
1
+ from sqlalchemy import and_, or_, not_, func, MetaData, Table, select
2
+
3
+ from .omnisearch import (
4
+ SingleSymbolEvaluator,
5
+ EquationEvaluator,
6
+ OmniSearch,
7
+ )
8
+
9
+
10
class AlchemySymbolEvaluator(SingleSymbolEvaluator):
    """
    Translates parsed search conditions into SQLAlchemy column expressions.
    """

    def __init__(self, table, ignore_case=False):
        self.table = table
        self.ignore_case = ignore_case

    def evaluate_complex_symbol(self, symbol, condition_data):
        # TODO make todo check if symbol exists in table?
        field, op, value = condition_data

        if op in ("==", "!="):
            column = self.table.c[field]
            if self.ignore_case:
                # Compare both sides lowercased for case-insensitive equality.
                column = func.lower(column)
                value = value.lower()
            return (column == value) if op == "==" else (column != value)

        comparisons = {
            ">": lambda c, v: c > v,
            "<": lambda c, v: c < v,
            ">=": lambda c, v: c >= v,
            "<=": lambda c, v: c <= v,
        }
        if op in comparisons:
            return comparisons[op](self.table.c[field], value)

        if op == "=":
            # "=" performs wildcard matching; translate glob "*" to SQL "%".
            pattern = value.replace("*", "%")
            column = self.table.c[field]
            return column.ilike(pattern) if self.ignore_case else column.like(pattern)

        raise IOError("Unsupported operator")

    def evaluate_simple_symbol(self, symbol):
        """
        A bare symbol matches against the link, title or description columns.

        TODO we could check by default if entry link == symbol, or sth
        """
        pattern = symbol.replace("*", "%")

        if self.ignore_case:
            matcher = lambda column: column.ilike(pattern)
        else:
            matcher = lambda column: column.like(pattern)

        return or_(
            matcher(self.table.c["link"]),
            matcher(self.table.c["title"]),
            matcher(self.table.c["description"]),
        )
77
+
78
+
79
class AlchemyEquationEvaluator(EquationEvaluator):
    """
    Combines evaluated sub-conditions with SQLAlchemy boolean operators.
    """

    def evaluate_function(self, operation_symbol, function, args0, args1):
        # Guard-clause dispatch: Not is unary, And/Or are binary.
        if function == "Not":  # ~ sign
            return not_(args0)
        if function == "And":  # & sign
            return and_(args0, args1)
        if function == "Or":  # | sign
            return or_(args0, args1)
        raise NotImplementedError("Not implemented function: {}".format(function))
89
+
90
+
91
class AlchemyRowHandler(object):
    """
    Base class for per-row callbacks used by AlchemySearch.

    Subclasses override handle_row() to process each result row.
    """

    def handle_row(self, row):
        # Default implementation ignores the row; subclasses override it.
        pass
94
+
95
+
96
class AlchemySearch(object):
    """
    Runs an OmniSearch expression against a SQLAlchemy-reflected table.
    """

    def __init__(self, db, search_term, row_handler=None, args=None, connection=None):
        """
        @param db SQLAlchemy engine used to reflect the destination table
        @param search_term Search expression understood by OmniSearch
        @param row_handler AlchemyRowHandler invoked for every result row
        @param args Optional namespace providing table / ignore_case /
               order_by / asc / desc attributes
        @param connection Open SQLAlchemy connection used to execute the query
        """
        self.db = db
        self.connection = connection
        self.search_term = search_term
        self.alchemy_row_handler = row_handler

        self.args = args

        self.get_destination_table()

    def search(self):
        """
        Generator: yields matching rows, passing each to the row handler first.
        """
        rows = self.get_filtered_objects()

        for row in rows:
            self.alchemy_row_handler.handle_row(row)
            yield row

    def get_destination_table(self):
        """
        Reflects the destination table; defaults to "linkdatamodel".
        """
        destination_metadata = MetaData()

        table_name = "linkdatamodel"
        if self.args and self.args.table:
            table_name = self.args.table

        self.destination_table = Table(
            table_name, destination_metadata, autoload_with=self.db
        )

    def get_query_conditions(self):
        """
        Builds the combined WHERE-clause conditions from the search term.
        """
        ignore_case = bool(self.args and self.args.ignore_case)

        symbol_evaluator = AlchemySymbolEvaluator(self.destination_table, ignore_case)
        equation_evaluator = AlchemyEquationEvaluator(
            self.search_term, symbol_evaluator
        )

        search = OmniSearch(self.search_term, equation_evaluator=equation_evaluator)
        return search.get_combined_query()

    def get_filtered_objects(self):
        """
        Executes the query and returns all matching rows.

        Raises AttributeError when the requested order-by column does not
        exist on the destination table.
        """
        combined_query_conditions = self.get_query_conditions()

        order_by_column_name = "id"
        if self.args and self.args.order_by:
            order_by_column_name = self.args.order_by

        order_by_column = getattr(self.destination_table.c, order_by_column_name, None)

        if order_by_column is None:
            # Bug fix: the message previously referenced self.args.order_by,
            # which itself raised AttributeError when args was None.
            raise AttributeError(f"Invalid order_by column: {order_by_column_name}")

        # Descending only when explicitly requested (and asc not set);
        # ascending otherwise — equivalent to the previous nested ternary.
        if self.args and self.args.desc and not self.args.asc:
            order_by_clause = order_by_column.desc()
        else:
            order_by_clause = order_by_column.asc()

        # SQLAlchemy Core select over the reflected table.
        stmt = (
            select(self.destination_table)
            .where(combined_query_conditions)
            .order_by(order_by_clause)
        )

        result = self.connection.execute(stmt)

        return result.fetchall()