linkarchivetools 0.1.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- linkarchivetools/LICENSE +674 -0
- linkarchivetools/README.md +3 -0
- linkarchivetools/__init__.py +8 -0
- linkarchivetools/backup.py +764 -0
- linkarchivetools/db2feeds.py +263 -0
- linkarchivetools/db2json.py +188 -0
- linkarchivetools/dbanalyzer.py +356 -0
- linkarchivetools/dbfilter.py +154 -0
- linkarchivetools/dbmerge.py +82 -0
- linkarchivetools/json2db.py +237 -0
- linkarchivetools/tableconfig.py +66 -0
- linkarchivetools/utils/alchemysearch.py +177 -0
- linkarchivetools/utils/omnisearch.py +335 -0
- linkarchivetools/utils/reflected.py +501 -0
- linkarchivetools-0.1.10.dist-info/LICENSE +674 -0
- linkarchivetools-0.1.10.dist-info/METADATA +38 -0
- linkarchivetools-0.1.10.dist-info/RECORD +18 -0
- linkarchivetools-0.1.10.dist-info/WHEEL +4 -0
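Taken together, the two modules shown in full below form a small conversion pipeline: Db2Feeds derives RSS feed entries from a link database, and Db2JSON dumps entry rows out to JSON files. A minimal end-to-end sketch, assuming the wheel is installed; the catalog.db, feeds.db and json paths are placeholders, not part of the package:

```python
# Sketch only: the file and directory names are placeholders.
from linkarchivetools.db2feeds import Db2Feeds
from linkarchivetools.db2json import Db2JSON

# Derive feed entries from an existing link catalog into a fresh SQLite file.
# Passing output_db switches output_format to "SQLITE"; the schema is carried
# over by copying the input database and truncating its tables.
feeds = Db2Feeds(
    input_db="catalog.db",
    output_db="feeds.db",
    clean=True,  # remove a stale feeds.db before converting
)
feeds.convert()

# Dump the resulting entries to JSON, rows_max entries per file.
dumper = Db2JSON(input_db="feeds.db", output_dir="json", format="entries")
dumper.convert()
```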
linkarchivetools/db2feeds.py
@@ -0,0 +1,263 @@
+"""
+Converts Database to information about RSS feeds.
+
+TODO - copy tags from origin to RSS feeds
+"""
+
+import shutil
+import argparse
+from pathlib import Path
+from sqlalchemy import create_engine
+
+from webtoolkit import RemoteUrl, BaseUrl
+from linkarchivetools import tableconfig
+from .utils.reflected import *
+
+
+class Db2Feeds(object):
+    """
+    Converter DB -> feeds.
+    """
+
+    def __init__(
+        self,
+        input_db=None,
+        output_db=None,
+        verbose=True,
+        clean=False,
+        remote_server="",
+        output_format=None,
+        read_internet_links=False,
+        update_feed=False,
+    ):
+        """
+        Constructor
+        @param read_internet_links Read links to find RSS feeds
+        @param update_feed Many things are copied from original entry.
+        If this setting is true, feed entry fetches title, and other properties
+        """
+        self.input_db = input_db
+        self.output_db = output_db
+        self.verbose = verbose
+        self.clean = clean
+        self.remote_server = remote_server
+        self.output_format = output_format
+        self.read_internet_links = read_internet_links
+        self.update_feed = update_feed
+
+        self.new_table = None
+
+        if self.output_db:
+            self.output_format = "SQLITE"
+        self.make_output_db()
+
+    def make_output_db(self):
+        if self.output_format != "SQLITE":
+            self.new_engine = None
+            self.new_connection = None
+            return
+
+        new_path = Path(self.output_db)
+        if new_path.exists() and self.clean:
+            new_path.unlink()
+
+        if not new_path.exists():
+            shutil.copy(self.input_db, self.output_db)
+            self.new_engine = create_engine(f"sqlite:///{self.output_db}")
+            with self.new_engine.connect() as new_connection:
+                self.new_connection = new_connection
+                self.truncate_tables()
+
+        self.new_engine = create_engine(f"sqlite:///{self.output_db}")
+
+    def convert(self):
+        """
+        API
+        """
+        self.engine = create_engine(f"sqlite:///{self.input_db}")
+        with self.engine.connect() as connection:
+            self.connection = connection
+            if self.new_engine:
+                with self.new_engine.connect() as new_connection:
+                    self.new_connection = new_connection
+                    self.convert_entries()
+            else:
+                self.convert_entries()
+
+    def convert_entries(self):
+        if self.clean:
+            self.truncate_tables()
+
+        self.new_table = None
+        if self.new_engine:
+            self.new_table = ReflectedEntryTable(self.new_engine, self.new_connection)
+
+        table = ReflectedEntryTable(self.engine, self.connection)
+        for entry in table.get_entries_good():
+            self.convert_entry(entry)
+
+    def convert_entry(self, entry):
+        url = BaseUrl(entry.link)
+        feeds = url.get_feeds()
+
+        if len(feeds) == 0:
+            if self.read_internet_links:
+                if self.remote_server:
+                    url_ex = RemoteUrl(remote_server_location=self.remote_server, url=entry.link)
+                    url_ex.get_response()
+                    feeds.extend(url_ex.get_feeds())
+
+        for feed in feeds:
+            data = self.prepare_data(entry, feed)
+
+            if self.new_table:
+                if not self.new_table.exists(link=feed):
+                    self.print_data(entry, data)
+                    self.copy_entry(entry, self.new_table, data)
+            else:
+                self.print_data(entry, data)
+
+    def prepare_data(self, entry, feed):
+        data = {}
+        data["link"] = feed
+        data["title"] = entry.title
+        data["page_rating_votes"] = entry.page_rating_votes
+        data["manual_status_code"] = entry.manual_status_code
+        data["thumbnail"] = entry.thumbnail
+        data["language"] = entry.language
+
+        # not null requirement
+        data["source_url"] = ""
+        data["permanent"] = False
+        data["bookmarked"] = False
+        data["status_code"] = entry.status_code
+        data["contents_type"] = 0
+        data["page_rating_contents"] = 0
+        data["page_rating_visits"] = 0
+        data["page_rating"] = 0
+
+        if self.update_feed and self.remote_server:
+            url_feed = RemoteUrl(remote_server=self.remote_server, url=feed)
+            url_feed.get_response()
+
+            data["title"] = url_feed.get_title()
+            data["description"] = url_feed.get_description()
+            data["status_code"] = url_feed.get_status_code()
+            data["thumbnail"] = url_feed.get_thumbnail()
+        return data
+
+    def copy_entry(self, entry, entry_table, data):
+        """
+        """
+        new_entry_id = entry_table.insert_json(data)
+        self.copy_tags(entry, new_entry_id)
+        self.copy_social_data(entry, new_entry_id)
+
+    def copy_tags(self, entry, new_entry_id):
+        source_entry_compacted_tags = ReflectedEntryCompactedTags(self.engine, self.connection)
+        tags = source_entry_compacted_tags.get_tags(entry.id)
+
+        entry_tag_data = {}
+        for tag in tags:
+            entry_tag_data["tag"] = tag
+            entry_tag_data["entry_id"] = new_entry_id
+            destination_entry_compacted_tags = ReflectedEntryCompactedTags(self.new_engine, self.new_connection)
+            destination_entry_compacted_tags.insert_json_data(entry_tag_data)
+
+    def copy_social_data(self, entry, new_entry_id):
+        source_entry_social_data = ReflectedSocialData(self.engine, self.connection)
+        social_data = source_entry_social_data.get_json(entry.id)
+        if social_data:
+            if "id" in social_data:
+                del social_data["id"]
+            social_data["entry_id"] = new_entry_id
+
+            destination_entry_social_data = ReflectedSocialData(self.new_engine, self.new_connection)
+            destination_entry_social_data.insert_json_data(social_data)
+
+    def truncate_tables(self):
+        if not self.new_engine:
+            return
+
+        table_names = self.get_table_names()
+        for table_name in table_names:
+            table = ReflectedTable(self.new_engine, self.new_connection)
+            table.truncate_table(table_name)
+
+        table = ReflectedTable(self.new_engine, self.new_connection)
+        table.vacuum()
+
+    def get_table_names(self):
+        return tableconfig.get_backup_tables()
+
+    def print_data(self, entry, data):
+        """
+        If we print to SQLITE we want to see progress so we display it anyway
+        """
+        if not self.verbose:
+            return
+
+        link = data["link"]
+        title = data["title"]
+        page_rating_votes = data["page_rating_votes"]
+
+        if self.output_format == "LINES" or self.output_format == "SQLITE":
+            print(f"[{page_rating_votes}] {link} - {title}")
+            user_tags = ReflectedEntryCompactedTags(self.engine, self.connection)
+            tags = user_tags.get_tags_string(entry.id)
+            if tags:
+                print(f"{tags}")
+        elif self.output_format == "JSON":
+            user_tags = ReflectedEntryCompactedTags(self.engine, self.connection)
+            tags = user_tags.get_tags(entry.id)
+            print(
+                f"""
+                {{ "title" : "{title}",
+                "link" : "{link}",
+                "page_rating_votes" : {page_rating_votes},
+                "tags" : {tags}
+                }}"""
+            )
+        else:
+            print("Unsupported output format")
+
+
+def parse():
+    parser = argparse.ArgumentParser(description="Data analyzer program")
+    parser.add_argument("--db", default="catalog.db", help="DB to be scanned")
+    parser.add_argument("--output-db", help="File to be created")
+    parser.add_argument("--update-rss", action="store_true", help="Reads RSS to check it's title and properties")
+    parser.add_argument("--clean", action="store_true", help="If output db exists, then it is removed at start")
+    parser.add_argument("--read-internet-links", action="store_true", help="Reads entries to check if contains RSS. Without it only calculated RSS are returned")
+    parser.add_argument(
+        "--output-format",
+        default="LINES",
+        help="format of display. LINES, JSON, SQLITE",
+    )
+    parser.add_argument("--crawling-server", default="", help="Remote crawling server")
+
+    args = parser.parse_args()
+
+    return parser, args
+
+
+def main():
+    parser, args = parse()
+
+    path = Path(args.db)
+    if not path.exists():
+        print("File {} does not exist".format(path))
+        return
+
+    reader = Db2Feeds(
+        input_db=args.db,
+        output_db=args.output_db,
+        clean=args.clean,
+        remote_server=args.crawling_server,
+        output_format=args.output_format,
+    )
+    reader.convert()
+
+
+if __name__ == "__main__":
+    main()
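db2feeds can also run without an output database; in that mode it only prints what it finds, which is the default path taken by main() above. A small print-only sketch, assuming a local catalog.db and no crawling server (the path is a placeholder):

```python
# Sketch: list discovered feed URLs without writing a new database.
from linkarchivetools.db2feeds import Db2Feeds

reader = Db2Feeds(
    input_db="catalog.db",   # placeholder path
    output_format="LINES",   # prints "[votes] link - title" plus any tags
    verbose=True,
)
reader.convert()
```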
linkarchivetools/db2json.py
@@ -0,0 +1,188 @@
+"""
+Converts database to JSON.
+"""
+
+import os
+import sys
+import json
+import shutil
+from pathlib import Path
+import argparse
+
+from sqlalchemy import create_engine
+from .utils.reflected import *
+
+
+class Db2JSON(object):
+
+    def __init__(self, input_db, output_dir, format=None, rows_max=1000):
+        self.input_db = input_db
+        self.output_dir = output_dir
+
+        self.format = format
+        self.rows_max = rows_max
+
+        self.file_index = 0
+        self.entry_index = 0
+        self.handle = None
+
+        self.rows = []
+
+        self.processed = 0
+        self.all = 0
+
+        self.setup()
+
+    def setup(self):
+        path = Path(self.input_db)
+        if not path.exists():
+            print("File {} does not exist".format(path))
+            return
+
+        if self.output_dir and self.output_dir != ".":
+            new_path = Path(self.output_dir)
+            if new_path.exists():
+                shutil.rmtree(new_path)
+            new_path.mkdir()
+
+        self.engine = create_engine(f"sqlite:///{self.input_db}")
+
+    def write(self, entry):
+        """Write entries to the specified directory, 1000 per file."""
+        if self.handle == None:
+            file_path = str(self.get_file_path())
+            self.handle = open(file_path, "w")
+
+        row = self.get_entry_json_data(entry)
+
+        self.rows.append(row)
+
+        self.entry_index += 1
+
+        sys.stdout.write(f"{self.file_index}/{self.entry_index:04d}\r")
+
+        if self.entry_index == self.rows_max:
+            self.file_index += 1
+            self.entry_index = 0
+            self.finish_stream()
+
+            file_path = str(self.get_file_path())
+            self.handle = open(file_path, "w")
+
+    def get_entry_json_data(self, entry):
+        date_published = entry.date_published
+        if date_published:
+            date_published = date_published.isoformat()
+
+        date_dead_since = entry.date_dead_since
+        if date_dead_since:
+            date_dead_since = date_dead_since.isoformat()
+
+        row = {
+            "link": entry.link,
+            "description": entry.description,
+            "author": entry.author,
+            "album": entry.album,
+            "bookmarked": entry.bookmarked,
+            "date_dead_since": date_dead_since,
+            "date_published": date_published,
+            "language": entry.language,
+            "manual_status_code": entry.manual_status_code,
+            "page_rating": entry.page_rating,
+            "page_rating_contents": entry.page_rating_contents,
+            "page_rating_votes": entry.page_rating_votes,
+            "page_rating_visits": entry.page_rating_visits,
+            "permanent": entry.permanent,
+            "source_url": entry.source_url,
+            "status_code": entry.status_code,
+            "thumbnail": entry.thumbnail,
+            "title": entry.title,
+            "age": entry.age,
+            "id": entry.id,
+        }
+
+        social_table = ReflectedSocialData(self.engine, self.connection)
+        social_data = social_table.get(entry.id)
+        if social_data:
+            row.setdefault("thumbs_up", social_data.thumbs_up)
+            row.setdefault("thumbs_down", social_data.thumbs_down)
+            row.setdefault("view_count", social_data.view_count)
+            row.setdefault("rating", social_data.rating)
+            row.setdefault("upvote_ratio", social_data.upvote_ratio)
+            row.setdefault("upvote_diff", social_data.upvote_diff)
+            row.setdefault("upvote_view_ratio", social_data.upvote_view_ratio)
+            row.setdefault("stars", social_data.stars)
+            row.setdefault("followers_count", social_data.followers_count)
+
+        tags_table = ReflectedUserTags(self.engine, self.connection)
+        tags = tags_table.get_tags(entry.id)
+        row["tags"] = tags
+
+        return row
+
+    def get_file_path(self):
+        filename = "{}_{}.json".format(self.format, str(self.file_index))
+        if self.output_dir and self.output_dir != ".":
+            return Path(self.output_dir) / filename
+        else:
+            return Path(filename)
+
+    def close(self):
+        if self.handle:
+            self.finish_stream()
+            self.handle = None
+
+    def finish_stream(self):
+        if not self.handle:
+            return
+
+        try:
+            string = json.dumps(self.rows, indent=4)
+            self.handle.write(string)
+        except ValueError as e:
+            print(f"Error writing file: {e}")
+        self.handle.close()
+        self.handle = None
+        self.rows = []
+
+    def convert(self):
+        with self.engine.connect() as connection:
+            self.connection = connection
+            table = ReflectedEntryTable(self.engine, connection)
+
+            for entry in table.get_entries():
+                # print(entry)
+                self.write(entry)
+
+        self.close()
+
+
+def parse():
+    parser = argparse.ArgumentParser(description="Data analyzer program")
+    parser.add_argument("--db", default="places.db", help="DB to be scanned")
+    parser.add_argument("--output-dir", default="json", help="Output directory")
+    parser.add_argument(
+        "--rows-max", default=1000, type=int, help="Number of rows per file"
+    )
+    parser.add_argument("-f", "--format", default="entries", help="file name format")
+    parser.add_argument("-v", "--verbosity", help="Verbosity level")
+
+    args = parser.parse_args()
+
+    return parser, args
+
+
+def main():
+    parser, args = parse()
+
+    f = Db2JSON(
+        input_db=args.db,
+        output_dir=args.output_dir,
+        format=args.format,
+        rows_max=args.rows_max,
+    )
+    f.convert()
+
+
+if __name__ == "__main__":
+    main()