linkarchivetools 0.1.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,501 @@
1
+ from sqlalchemy import (
2
+ MetaData,
3
+ Table,
4
+ select,
5
+ delete,
6
+ or_,
7
+ and_,
8
+ exists,
9
+ text,
10
+ inspect,
11
+ insert,
12
+ update,
13
+ Index,
14
+ )
15
+
16
+
17
class ReflectedTable(object):
    """Database-wide helpers that work from table names via a SQLAlchemy engine.

    Every method opens its own connection from ``self.engine``; nothing is
    kept open between calls.
    """

    def __init__(self, engine, connection):
        """Store the engine.

        ``connection`` is accepted so existing callers keep working, but it is
        not stored or used by this class.
        """
        self.engine = engine

    def truncate_table(self, table_name):
        """Delete every row of ``table_name`` inside a single transaction.

        NOTE: ``table_name`` is interpolated into the SQL text verbatim, so it
        must come from trusted code, never from user input.
        """
        sql_text = f"DELETE FROM {table_name};"

        with self.engine.begin() as connection:
            connection.execute(text(sql_text))

    def create_index(self, table, column_name):
        """Create an index named ``idx_<table.name>_<column_name>``.

        ``table`` is a SQLAlchemy ``Table``; ``column_name`` must be one of
        its columns (``getattr`` on ``table.c`` raises otherwise).
        """
        index_name = f"idx_{table.name}_{column_name}"
        index = Index(index_name, getattr(table.c, column_name))

        index.create(bind=self.engine)

    def vacuum(self):
        """Run ``VACUUM`` on a connection switched to AUTOCOMMIT."""
        with self.engine.connect() as connection:
            # Execute on the connection *returned* by execution_options():
            # this is correct whether the call mutates the connection in
            # place or hands back a configured copy.
            autocommit_connection = connection.execution_options(
                isolation_level="AUTOCOMMIT"
            )
            autocommit_connection.execute(text("VACUUM"))

    def close(self):
        """Do nothing; no resources are held between calls."""
        pass

    def count(self, table_name):
        """Return the number of rows in ``table_name``.

        NOTE: ``table_name`` is interpolated into the SQL text verbatim.
        """
        sql_text = text(f"SELECT COUNT(*) FROM {table_name}")
        with self.engine.connect() as connection:
            row_count = connection.execute(sql_text).scalar_one()
            return row_count

    def print_summary(self, print_columns=False):
        """Print the row count of every table, optionally with column names."""
        tables = self.get_table_names()

        for table in tables:
            row_count = self.count(table)
            print(f"Table: {table}, Row count: {row_count}")

            if print_columns:
                column_names = self.get_column_names(table)
                print(f"Columns in {table}: {', '.join(column_names)}")

    def get_table_names(self):
        """Return the table names reported by the engine's inspector as a list."""
        inspector = inspect(self.engine)
        tables = inspector.get_table_names()
        return list(tables)

    def get_column_names(self, table):
        """Return the column names of the table called ``table``."""
        inspector = inspect(self.engine)

        columns = inspector.get_columns(table)
        column_names = [column["name"] for column in columns]
        return column_names

    def row_to_json_data(self, row):
        """Convert a result row (anything exposing ``_mapping``) to a dict."""
        data = dict(row._mapping)
        return data

    def run_sql(self, sql_text):
        """Execute the raw SQL string ``sql_text`` inside a transaction."""
        with self.engine.begin() as connection:
            connection.execute(text(sql_text))
77
+
78
+
79
+ class ReflectedGenericTable(object):
80
+ def __init__(self, engine, connection, table_name=None):
81
+ self.engine = engine
82
+ self.table_name = table_name
83
+ if self.table_name is None:
84
+ self.table_name = self.get_table_name()
85
+ self.table = None
86
+
87
+ def get_table_name():
88
+ return self.table_name
89
+
90
+ def get_table(self):
91
+ if self.table is None:
92
+ destination_metadata = MetaData()
93
+ self.table = Table(
94
+ self.table_name, destination_metadata, autoload_with=self.engine
95
+ )
96
+ return self.table
97
+ return self.table
98
+
99
+ def truncate(self):
100
+ sql_text = f"DELETE FROM {self.table_name};"
101
+ with self.engine.begin() as connection:
102
+ result = connection.execute(text(sql_text))
103
+
104
+ def create_index(self, column_name):
105
+ index_name = f"idx_{self.table.name}_{column_name}"
106
+ index = Index(index_name, getattr(table.c, column_name))
107
+
108
+ index.create(bind=self.engine)
109
+
110
+ def insert_json_data(self, json_data: dict):
111
+ table = self.get_table()
112
+
113
+ stmt = (
114
+ insert(table)
115
+ .values(**json_data)
116
+ .returning(table.c.id)
117
+ )
118
+
119
+ with self.engine.begin() as connection:
120
+ result = connection.execute(stmt)
121
+ inserted_id = result.scalar_one()
122
+
123
+ return inserted_id
124
+
125
+ def update_json_data(self, id, json_data):
126
+ table = self.get_table()
127
+
128
+ stmt = (
129
+ update(table)
130
+ .where(table.c.id == id)
131
+ .values(**json_data)
132
+ )
133
+
134
+ with self.engine.begin() as connection:
135
+ connection.execute(stmt)
136
+
137
+ def count(self):
138
+ sql = text(f"SELECT COUNT(*) FROM {self.table_name}")
139
+ with self.engine.connect() as connection:
140
+ row_count = connection.execute(sql).scalar_one()
141
+ return row_count
142
+
143
+ def get(self, id):
144
+ table = self.get_table()
145
+ stmt = select(table).where(table.c.id == id)
146
+
147
+ with self.engine.connect() as connection:
148
+ result = connection.execute(stmt)
149
+ return result.first()
150
+
151
+ def get_where(self,
152
+ conditions_map: dict=None,
153
+ conditions=None,
154
+ order_by=None,
155
+ limit:int|None=None,
156
+ offset:int=0):
157
+ """
158
+ @param conditions_map can be passed as {"name": "Test"}
159
+ @param conditions can be passed as [destionation_table.c.rating > 5]
160
+ @param order_by can be passed as [destionation_table.c.name.asc()]
161
+ """
162
+ destination_table = self.get_table()
163
+
164
+ if not conditions:
165
+ conditions = []
166
+
167
+ if not conditions and conditions_map:
168
+ for column_name, value in conditions_map.items():
169
+ if not hasattr(destination_table.c, column_name):
170
+ raise ValueError(f"Unknown column: {column_name}")
171
+
172
+ conditions.append(getattr(destination_table.c, column_name) == value)
173
+
174
+ stmt = select(destination_table)
175
+
176
+ if conditions:
177
+ stmt = stmt.where(or_(*conditions))
178
+ if order_by:
179
+ stmt = stmt.order_by(*order_by)
180
+ if offset:
181
+ stmt = stmt.offset(offset)
182
+ if limit is not None:
183
+ stmt = stmt.limit(limit)
184
+
185
+ with self.engine.connect() as connection:
186
+ result = connection.execute(stmt)
187
+ rows = result.fetchall() # fetch all rows immediately
188
+
189
+ return rows
190
+
191
+ def delete(self, id):
192
+ table = self.get_table()
193
+ stmt = delete(table).where(table.c.id == id)
194
+
195
+ with self.engine.begin() as connection:
196
+ result = connection.execute(stmt)
197
+ rowcount = result.rowcount # number of rows deleted
198
+
199
+ return rowcount
200
+
201
+ def delete_where(self, conditions: dict):
202
+ table = self.get_table()
203
+
204
+ filters = []
205
+ for column_name, value in conditions.items():
206
+ if not hasattr(table.c, column_name):
207
+ raise ValueError(f"Unknown column: {column_name}")
208
+ filters.append(getattr(table.c, column_name) == value)
209
+
210
+ stmt = delete(table).where(and_(*filters))
211
+
212
+ with self.engine.begin() as connection:
213
+ result = connection.execute(stmt)
214
+ rowcount = result.rowcount # number of rows deleted
215
+
216
+ return rowcount
217
+
218
+ def print_summary(self, print_columns=False):
219
+ row_count = self.count()
220
+ print(f"Table: {self.table_name}, Row count: {row_count}")
221
+
222
+ if print_columns:
223
+ column_names = self.get_column_names()
224
+ print(f"Columns in {self.table_name}: {', '.join(column_names)}")
225
+
226
+ def get_column_names(self):
227
+ inspector = inspect(self.engine)
228
+
229
+ with self.engine.connect() as connection:
230
+ row_count = connection.execute(text(f"SELECT COUNT(*) FROM {self.table_name}")).scalar_one()
231
+
232
+ columns = inspector.get_columns(self.table_name)
233
+ column_names = [column["name"] for column in columns]
234
+ return column_names
235
+
236
+ def row_to_json_data(self, row):
237
+ """
238
+ Convert SQLAlchemy row to a dict
239
+ """
240
+ return dict(row._mapping)
241
+
242
+ def run_sql(self, sql_text):
243
+ with self.engine.begin() as connection:
244
+ connection.execute(text(sql_text))
245
+
246
+
247
class ReflectedEntryTable(ReflectedGenericTable):
    """Accessor for the "linkdatamodel" table."""

    def get_table_name(self):
        """Return the fixed table name."""
        return "linkdatamodel"

    def insert_json(self, entry_json):
        """Insert an entry dict and return the new row id.

        Returns None without inserting when the dict has no "link" key.
        Missing optional keys are filled in on ``entry_json`` itself before
        the insert.
        """
        if "link" not in entry_json:
            return None

        fallback_values = (
            ("source_url", ""),
            ("permanent", False),
            ("bookmarked", False),
            ("status_code", 0),
            ("contents_type", 0),
            ("page_rating_contents", 0),
            ("page_rating_visits", 0),
            ("page_rating_votes", 0),
            ("page_rating", 0),
        )
        for key, fallback in fallback_values:
            entry_json.setdefault(key, fallback)

        return self.insert_json_data(entry_json)

    def get_entries(self, limit: int | None = None, offset: int = 0):
        """
        Return all entries as a list, honouring offset and limit.

        TODO remove use get_where
        """
        entries = self.get_table()
        query = select(entries)

        if offset:
            query = query.offset(offset)
        if limit is not None:
            query = query.limit(limit)

        with self.engine.connect() as connection:
            # Materialise the rows while the connection is still open.
            return connection.execute(query).fetchall()

    def get_entries_good(self):
        """
        Return entries with positive page_rating_votes, highest votes first.

        TODO remove use get_where
        """
        entries = self.get_table()
        votes = entries.c.page_rating_votes
        query = select(entries).where(votes > 0).order_by(votes.desc())

        with self.engine.connect() as connection:
            # Materialise the rows while the connection is still open.
            return connection.execute(query).fetchall()

    def exists(self, *, id=None, link=None):
        """Tell whether a row matches the given id or link.

        Returns False when neither argument is supplied; otherwise returns
        the scalar result of an EXISTS query over the OR of the conditions.
        """
        entries = self.get_table()

        filters = []
        if id is not None:
            filters.append(entries.c.id == id)
        if link is not None:
            filters.append(entries.c.link == link)

        if len(filters) == 0:
            return False

        query = select(exists().where(or_(*filters)))

        with self.engine.connect() as connection:
            return connection.execute(query).scalar()
325
+
326
+
327
class ReflectedUserTags(ReflectedGenericTable):
    """Accessor for the "usertags" table."""

    def get_table_name(self):
        """Return the fixed table name."""
        return "usertags"

    def get_tags_string(self, entry_id):
        """Return the entry's tags joined as '#tag1, #tag2'."""
        user_tags = self.get_table()
        query = select(user_tags).where(user_tags.c.entry_id == entry_id)

        with self.engine.connect() as connection:
            hashtags = [f"#{row.tag}" for row in connection.execute(query)]

        return ", ".join(hashtags)

    def get_tags(self, entry_id):
        """Return the list of tag values stored for ``entry_id``."""
        user_tags = self.get_table()
        query = select(user_tags).where(user_tags.c.entry_id == entry_id)

        with self.engine.connect() as connection:
            found = [row.tag for row in connection.execute(query)]

        return found
355
+
356
+
357
class ReflectedEntryCompactedTags(ReflectedGenericTable):
    """Accessor for the "entrycompactedtags" table."""

    def get_table_name(self):
        """Return the fixed table name."""
        return "entrycompactedtags"

    def get_tags(self, entry_id):
        """Collect the tag value of every row stored for ``entry_id``."""
        compacted = self.get_table()
        query = select(compacted).where(compacted.c.entry_id == entry_id)

        with self.engine.connect() as connection:
            found = [row.tag for row in connection.execute(query)]

        return found

    def get_tags_string(self, entry_id):
        """Format the tags of ``entry_id`` as one string: '#tag1, #tag2'."""
        compacted = self.get_table()
        query = select(compacted).where(compacted.c.entry_id == entry_id)

        with self.engine.connect() as connection:
            hashtags = [f"#{row.tag}" for row in connection.execute(query)]

        return ", ".join(hashtags)
387
+
388
+
389
class ReflectedSourceTable(ReflectedGenericTable):
    """Accessor for the "sourcedatamodel" table."""

    def get_table_name(self):
        """Return the fixed table name."""
        return "sourcedatamodel"

    def get_source(self, source_id):
        """Fetch the first row whose id equals ``source_id``; None if absent."""
        sources = self.get_table()
        query = select(sources).where(sources.c.id == source_id)

        with self.engine.connect() as connection:
            result = connection.execute(query)
            return result.first()

    def get_sources(self, limit: int | None = None, offset: int = 0):
        """Return source rows as a list, honouring offset and limit."""
        sources = self.get_table()
        query = select(sources)

        if offset:
            query = query.offset(offset)
        if limit is not None:
            query = query.limit(limit)

        with self.engine.connect() as connection:
            # Materialise the rows while the connection is still open.
            return connection.execute(query).fetchall()

    def insert_json(self, source_json: dict):
        """Insert a source dict, defaulting a missing 'url' key to ''.

        The default is written into ``source_json`` itself.
        """
        if "url" not in source_json:
            source_json["url"] = ""
        return self.insert_json_data(source_json)

    def exists(self, *, id=None, url=None):
        """Tell whether a source matches the given id or url.

        Returns False when neither argument is supplied.
        """
        sources = self.get_table()

        filters = []
        if id is not None:
            filters.append(sources.c.id == id)
        if url is not None:
            filters.append(sources.c.url == url)

        if len(filters) == 0:
            return False

        query = select(exists().where(or_(*filters)))

        with self.engine.connect() as connection:
            return connection.execute(query).scalar()
438
+
439
+
440
class ReflectedSocialData(ReflectedGenericTable):
    """Accessor for the "socialdata" table, looked up by entry_id."""

    def get_table_name(self):
        """Return the fixed table name."""
        return "socialdata"

    def get(self, entry_id):
        """Fetch the first row whose entry_id matches; None if there is none."""
        social = self.get_table()
        query = select(social).where(social.c.entry_id == entry_id)

        with self.engine.connect() as connection:
            result = connection.execute(query)
            return result.first()

    def get_json(self, entry_id):
        """Fetch the row for ``entry_id`` as a dict; None if there is none."""
        found = self.get(entry_id)
        return None if found is None else self.row_to_json_data(found)
459
+
460
+
461
class EntryCopier(object):
    """Copy an entry and its related rows from a source to a destination DB."""

    def __init__(self, src_engine, src_connection, dst_engine, dst_connection):
        """Store the source and destination engine/connection pairs."""
        self.src_engine = src_engine
        self.src_connection = src_connection

        self.dst_engine = dst_engine
        self.dst_connection = dst_connection

    def copy_entry(self, entry):
        """Copy ``entry`` into the destination and return the new entry id.

        ``entry`` is a source row; its ``id`` is dropped so the destination
        assigns a fresh one. Tags and social data are copied only when the
        insert produced an id. Returns None when the entry was not inserted.
        """
        entry_table = ReflectedEntryTable(self.dst_engine, self.dst_connection)
        data = entry_table.row_to_json_data(entry)
        # Tolerate rows without an "id" key, as copy_social_data does.
        data.pop("id", None)
        new_entry_id = entry_table.insert_json(data)
        if new_entry_id is not None:
            self.copy_tags(entry, new_entry_id)
            self.copy_social_data(entry, new_entry_id)
        return new_entry_id

    def copy_tags(self, entry, new_entry_id):
        """Copy the compacted tags of ``entry`` onto ``new_entry_id``."""
        source_entry_compacted_tags = ReflectedEntryCompactedTags(
            self.src_engine, self.src_connection
        )
        tags = source_entry_compacted_tags.get_tags(entry.id)
        if not tags:
            return

        # One destination wrapper for all tags: each wrapper reflects the
        # table on first use, so building one per tag repeats that work.
        destination_entry_compacted_tags = ReflectedEntryCompactedTags(
            self.dst_engine, self.dst_connection
        )
        for tag in tags:
            # Fresh dict per row rather than one dict mutated across inserts.
            destination_entry_compacted_tags.insert_json_data(
                {"tag": tag, "entry_id": new_entry_id}
            )

    def copy_social_data(self, entry, new_entry_id):
        """Copy the social-data row of ``entry`` onto ``new_entry_id``, if any."""
        source_entry_social_data = ReflectedSocialData(
            self.src_engine, self.src_connection
        )
        social_data = source_entry_social_data.get_json(entry.id)
        if social_data:
            # Drop the source primary key so the destination assigns its own.
            social_data.pop("id", None)
            social_data["entry_id"] = new_entry_id

            destination_entry_social_data = ReflectedSocialData(
                self.dst_engine, self.dst_connection
            )
            destination_entry_social_data.insert_json_data(social_data)