linkarchivetools 0.1.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,263 @@
+ """
+ Converts Database to information about RSS feeds.
+
+ TODO - copy tags from origin to RSS feeds
+ """
+
+ import shutil
+ import argparse
+ from pathlib import Path
+ from sqlalchemy import create_engine
+
+ from webtoolkit import RemoteUrl, BaseUrl
+ from linkarchivetools import tableconfig
+ from .utils.reflected import *
+
+
+ class Db2Feeds(object):
+     """
+     Converter DB -> feeds.
+     """
+
+     def __init__(
+         self,
+         input_db=None,
+         output_db=None,
+         verbose=True,
+         clean=False,
+         remote_server="",
+         output_format=None,
+         read_internet_links=False,
+         update_feed=False,
+     ):
+         """
+         Constructor
+         @param read_internet_links Read links to find RSS feeds
+         @param update_feed Many properties are copied from the original entry.
+                If this setting is true, the feed entry fetches its own title and other properties.
+         """
+         self.input_db = input_db
+         self.output_db = output_db
+         self.verbose = verbose
+         self.clean = clean
+         self.remote_server = remote_server
+         self.output_format = output_format
+         self.read_internet_links = read_internet_links
+         self.update_feed = update_feed
+
+         self.new_table = None
+         # Always define these; convert() checks new_engine even when no
+         # output database is configured.
+         self.new_engine = None
+         self.new_connection = None
+
+         if self.output_db:
+             self.output_format = "SQLITE"
+             self.make_output_db()
+
+     def make_output_db(self):
+         if self.output_format != "SQLITE":
+             self.new_engine = None
+             self.new_connection = None
+             return
+
+         new_path = Path(self.output_db)
+         if new_path.exists() and self.clean:
+             new_path.unlink()
+
+         if not new_path.exists():
+             # Copy the input DB to get the schema, then empty the tables.
+             shutil.copy(self.input_db, self.output_db)
+             self.new_engine = create_engine(f"sqlite:///{self.output_db}")
+             with self.new_engine.connect() as new_connection:
+                 self.new_connection = new_connection
+                 self.truncate_tables()
+
+         self.new_engine = create_engine(f"sqlite:///{self.output_db}")
+
+     def convert(self):
+         """
+         API entry point.
+         """
+         self.engine = create_engine(f"sqlite:///{self.input_db}")
+         with self.engine.connect() as connection:
+             self.connection = connection
+             if self.new_engine:
+                 with self.new_engine.connect() as new_connection:
+                     self.new_connection = new_connection
+                     self.convert_entries()
+             else:
+                 self.convert_entries()
+
+     def convert_entries(self):
+         if self.clean:
+             self.truncate_tables()
+
+         self.new_table = None
+         if self.new_engine:
+             self.new_table = ReflectedEntryTable(self.new_engine, self.new_connection)
+
+         table = ReflectedEntryTable(self.engine, self.connection)
+         for entry in table.get_entries_good():
+             self.convert_entry(entry)
+
+     def convert_entry(self, entry):
+         url = BaseUrl(entry.link)
+         feeds = url.get_feeds()
+
+         if len(feeds) == 0:
+             if self.read_internet_links:
+                 if self.remote_server:
+                     url_ex = RemoteUrl(remote_server_location=self.remote_server, url=entry.link)
+                     url_ex.get_response()
+                     feeds.extend(url_ex.get_feeds())
+
+         for feed in feeds:
+             data = self.prepare_data(entry, feed)
+
+             if self.new_table:
+                 if not self.new_table.exists(link=feed):
+                     self.print_data(entry, data)
+                     self.copy_entry(entry, self.new_table, data)
+             else:
+                 self.print_data(entry, data)
+
+     def prepare_data(self, entry, feed):
+         data = {}
+         data["link"] = feed
+         data["title"] = entry.title
+         data["page_rating_votes"] = entry.page_rating_votes
+         data["manual_status_code"] = entry.manual_status_code
+         data["thumbnail"] = entry.thumbnail
+         data["language"] = entry.language
+
+         # not null requirement
+         data["source_url"] = ""
+         data["permanent"] = False
+         data["bookmarked"] = False
+         data["status_code"] = entry.status_code
+         data["contents_type"] = 0
+         data["page_rating_contents"] = 0
+         data["page_rating_visits"] = 0
+         data["page_rating"] = 0
+
+         if self.update_feed and self.remote_server:
+             url_feed = RemoteUrl(remote_server=self.remote_server, url=feed)
+             url_feed.get_response()
+
+             data["title"] = url_feed.get_title()
+             data["description"] = url_feed.get_description()
+             data["status_code"] = url_feed.get_status_code()
+             data["thumbnail"] = url_feed.get_thumbnail()
+         return data
+
+     def copy_entry(self, entry, entry_table, data):
+         """
+         Copies an entry into the output table, together with its tags and social data.
+         """
+         new_entry_id = entry_table.insert_json(data)
+         self.copy_tags(entry, new_entry_id)
+         self.copy_social_data(entry, new_entry_id)
+
+     def copy_tags(self, entry, new_entry_id):
+         source_entry_compacted_tags = ReflectedEntryCompactedTags(self.engine, self.connection)
+         tags = source_entry_compacted_tags.get_tags(entry.id)
+
+         entry_tag_data = {}
+         for tag in tags:
+             entry_tag_data["tag"] = tag
+             entry_tag_data["entry_id"] = new_entry_id
+             destination_entry_compacted_tags = ReflectedEntryCompactedTags(self.new_engine, self.new_connection)
+             destination_entry_compacted_tags.insert_json_data(entry_tag_data)
+
+     def copy_social_data(self, entry, new_entry_id):
+         source_entry_social_data = ReflectedSocialData(self.engine, self.connection)
+         social_data = source_entry_social_data.get_json(entry.id)
+         if social_data:
+             if "id" in social_data:
+                 del social_data["id"]
+             social_data["entry_id"] = new_entry_id
+
+             destination_entry_social_data = ReflectedSocialData(self.new_engine, self.new_connection)
+             destination_entry_social_data.insert_json_data(social_data)
+
+     def truncate_tables(self):
+         if not self.new_engine:
+             return
+
+         table_names = self.get_table_names()
+         for table_name in table_names:
+             table = ReflectedTable(self.new_engine, self.new_connection)
+             table.truncate_table(table_name)
+
+         table = ReflectedTable(self.new_engine, self.new_connection)
+         table.vacuum()
+
+     def get_table_names(self):
+         return tableconfig.get_backup_tables()
+
+     def print_data(self, entry, data):
+         """
+         Even when writing to SQLITE we want to see progress, so the data is displayed anyway.
+         """
+         if not self.verbose:
+             return
+
+         link = data["link"]
+         title = data["title"]
+         page_rating_votes = data["page_rating_votes"]
+
+         if self.output_format == "LINES" or self.output_format == "SQLITE":
+             print(f"[{page_rating_votes}] {link} - {title}")
+             user_tags = ReflectedEntryCompactedTags(self.engine, self.connection)
+             tags = user_tags.get_tags_string(entry.id)
+             if tags:
+                 print(f"{tags}")
+         elif self.output_format == "JSON":
+             user_tags = ReflectedEntryCompactedTags(self.engine, self.connection)
+             tags = user_tags.get_tags(entry.id)
+             print(
+                 f"""
+                 {{ "title" : "{title}",
+                 "link" : "{link}",
+                 "page_rating_votes" : {page_rating_votes},
+                 "tags" : {tags}
+                 }}"""
+             )
+         else:
+             print("Unsupported output format")
+
+
+ def parse():
+     parser = argparse.ArgumentParser(description="Data analyzer program")
+     parser.add_argument("--db", default="catalog.db", help="DB to be scanned")
+     parser.add_argument("--output-db", help="File to be created")
+     parser.add_argument("--update-rss", action="store_true", help="Reads each RSS feed to fetch its title and other properties")
+     parser.add_argument("--clean", action="store_true", help="If the output DB exists, it is removed at start")
+     parser.add_argument("--read-internet-links", action="store_true", help="Reads entries to check if they contain RSS feeds. Without it only feeds derived from the link itself are returned")
+     parser.add_argument(
+         "--output-format",
+         default="LINES",
+         help="Format of display: LINES, JSON, SQLITE",
+     )
+     parser.add_argument("--crawling-server", default="", help="Remote crawling server")
+
+     args = parser.parse_args()
+
+     return parser, args
+
+
+ def main():
+     parser, args = parse()
+
+     path = Path(args.db)
+     if not path.exists():
+         print("File {} does not exist".format(path))
+         return
+
+     reader = Db2Feeds(
+         input_db=args.db,
+         output_db=args.output_db,
+         clean=args.clean,
+         remote_server=args.crawling_server,
+         output_format=args.output_format,
+         read_internet_links=args.read_internet_links,
+         update_feed=args.update_rss,
+     )
+     reader.convert()
+
+
+ if __name__ == "__main__":
+     main()
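
For reference, a minimal usage sketch for the converter class above. The import path, file names, and server address are illustrative assumptions, not taken from the diff; only the constructor parameters come from the source.

# Hedged usage sketch: the import path and all names below are placeholders.
from linkarchivetools.db2feeds import Db2Feeds  # assumed module name

converter = Db2Feeds(
    input_db="catalog.db",                  # default DB name from the CLI above
    output_db="feeds.db",                   # setting this switches output to SQLITE
    clean=True,                             # remove an existing feeds.db first
    remote_server="http://127.0.0.1:3000",  # optional crawling server (placeholder address)
)
converter.convert()
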
@@ -0,0 +1,188 @@
+ """
+ Converts database to JSON.
+ """
+
+ import os
+ import sys
+ import json
+ import shutil
+ from pathlib import Path
+ import argparse
+
+ from sqlalchemy import create_engine
+ from .utils.reflected import *
+
+
+ class Db2JSON(object):
+
+     def __init__(self, input_db, output_dir, format=None, rows_max=1000):
+         self.input_db = input_db
+         self.output_dir = output_dir
+
+         self.format = format
+         self.rows_max = rows_max
+
+         self.file_index = 0
+         self.entry_index = 0
+         self.handle = None
+
+         self.rows = []
+
+         self.processed = 0
+         self.all = 0
+
+         self.setup()
+
+     def setup(self):
+         path = Path(self.input_db)
+         if not path.exists():
+             print("File {} does not exist".format(path))
+             return
+
+         if self.output_dir and self.output_dir != ".":
+             new_path = Path(self.output_dir)
+             if new_path.exists():
+                 shutil.rmtree(new_path)
+             new_path.mkdir()
+
+         self.engine = create_engine(f"sqlite:///{self.input_db}")
+
+     def write(self, entry):
+         """Write entries to the specified directory, rows_max entries per file."""
+         if self.handle is None:
+             file_path = str(self.get_file_path())
+             self.handle = open(file_path, "w")
+
+         row = self.get_entry_json_data(entry)
+
+         self.rows.append(row)
+
+         self.entry_index += 1
+
+         sys.stdout.write(f"{self.file_index}/{self.entry_index:04d}\r")
+
+         if self.entry_index == self.rows_max:
+             self.file_index += 1
+             self.entry_index = 0
+             self.finish_stream()
+             # The next call to write() opens the next file lazily, which
+             # avoids leaving an empty trailing file behind.
+
+     def get_entry_json_data(self, entry):
+         date_published = entry.date_published
+         if date_published:
+             date_published = date_published.isoformat()
+
+         date_dead_since = entry.date_dead_since
+         if date_dead_since:
+             date_dead_since = date_dead_since.isoformat()
+
+         row = {
+             "link": entry.link,
+             "description": entry.description,
+             "author": entry.author,
+             "album": entry.album,
+             "bookmarked": entry.bookmarked,
+             "date_dead_since": date_dead_since,
+             "date_published": date_published,
+             "language": entry.language,
+             "manual_status_code": entry.manual_status_code,
+             "page_rating": entry.page_rating,
+             "page_rating_contents": entry.page_rating_contents,
+             "page_rating_votes": entry.page_rating_votes,
+             "page_rating_visits": entry.page_rating_visits,
+             "permanent": entry.permanent,
+             "source_url": entry.source_url,
+             "status_code": entry.status_code,
+             "thumbnail": entry.thumbnail,
+             "title": entry.title,
+             "age": entry.age,
+             "id": entry.id,
+         }
+
+         social_table = ReflectedSocialData(self.engine, self.connection)
+         social_data = social_table.get(entry.id)
+         if social_data:
+             row.setdefault("thumbs_up", social_data.thumbs_up)
+             row.setdefault("thumbs_down", social_data.thumbs_down)
+             row.setdefault("view_count", social_data.view_count)
+             row.setdefault("rating", social_data.rating)
+             row.setdefault("upvote_ratio", social_data.upvote_ratio)
+             row.setdefault("upvote_diff", social_data.upvote_diff)
+             row.setdefault("upvote_view_ratio", social_data.upvote_view_ratio)
+             row.setdefault("stars", social_data.stars)
+             row.setdefault("followers_count", social_data.followers_count)
+
+         tags_table = ReflectedUserTags(self.engine, self.connection)
+         tags = tags_table.get_tags(entry.id)
+         row["tags"] = tags
+
+         return row
+
+     def get_file_path(self):
+         filename = "{}_{}.json".format(self.format, str(self.file_index))
+         if self.output_dir and self.output_dir != ".":
+             return Path(self.output_dir) / filename
+         else:
+             return Path(filename)
+
+     def close(self):
+         if self.handle:
+             self.finish_stream()
+             self.handle = None
+
+     def finish_stream(self):
+         if not self.handle:
+             return
+
+         try:
+             string = json.dumps(self.rows, indent=4)
+             self.handle.write(string)
+         except (TypeError, ValueError) as e:
+             print(f"Error serializing rows to JSON: {e}")
+         self.handle.close()
+         self.handle = None
+         self.rows = []
+
+     def convert(self):
+         with self.engine.connect() as connection:
+             self.connection = connection
+             table = ReflectedEntryTable(self.engine, connection)
+
+             for entry in table.get_entries():
+                 # print(entry)
+                 self.write(entry)
+
+             self.close()
+
+
+ def parse():
+     parser = argparse.ArgumentParser(description="Data analyzer program")
+     parser.add_argument("--db", default="places.db", help="DB to be scanned")
+     parser.add_argument("--output-dir", default="json", help="Output directory")
+     parser.add_argument(
+         "--rows-max", default=1000, type=int, help="Number of rows per file"
+     )
+     parser.add_argument("-f", "--format", default="entries", help="File name format")
+     parser.add_argument("-v", "--verbosity", help="Verbosity level")
+
+     args = parser.parse_args()
+
+     return parser, args
+
+
+ def main():
+     parser, args = parse()
+
+     f = Db2JSON(
+         input_db=args.db,
+         output_dir=args.output_dir,
+         format=args.format,
+         rows_max=args.rows_max,
+     )
+     f.convert()
+
+
+ if __name__ == "__main__":
+     main()
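
A similar sketch for the JSON exporter. The import path is an assumption; the argument values shown mirror the defaults defined in parse() above.

# Hedged usage sketch: the import path is a placeholder, the values are the CLI defaults.
from linkarchivetools.db2json import Db2JSON  # assumed module name

exporter = Db2JSON(
    input_db="places.db",  # default --db
    output_dir="json",     # default --output-dir, recreated on each run
    format="entries",      # file name prefix: entries_0.json, entries_1.json, ...
    rows_max=1000,         # entries per output file
)
exporter.convert()
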