ohmyscrapper 0.2.1__py3-none-any.whl → 0.6.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
ohmyscrapper/__init__.py CHANGED
@@ -3,7 +3,7 @@ import argparse
  from ohmyscrapper.modules.classify_urls import classify_urls
  from ohmyscrapper.modules.sniff_url import sniff_url
  from ohmyscrapper.modules.load_txt import load_txt
- from ohmyscrapper.modules.seed import seed
+ from ohmyscrapper.modules.seed import seed, export_url_types_to_file
  from ohmyscrapper.modules.scrap_urls import scrap_urls
  from ohmyscrapper.modules.show import (
      show_url,
@@ -15,23 +15,40 @@ from ohmyscrapper.modules.show import (
  from ohmyscrapper.modules.untouch_all import untouch_all
  from ohmyscrapper.modules.process_with_ai import process_with_ai, reprocess_ai_history
  from ohmyscrapper.modules.merge_dbs import merge_dbs
+ from ohmyscrapper.core.config import update


  def main():
      parser = argparse.ArgumentParser(prog="ohmyscrapper")
-     parser.add_argument("--version", action="version", version="%(prog)s v0.2.1")
+     parser.add_argument("--version", action="version", version="%(prog)s v0.6.1")

+     update()
      subparsers = parser.add_subparsers(dest="command", help="Available commands")
+     start_parser = subparsers.add_parser(
+         "start",
+         help="Make the entire process of 📦 loading, 🐶 scraping and 📜🖋️ exporting with the default configuration.",
+     )

-     ai_process_parser = subparsers.add_parser(
-         "process-with-ai", help="Process with AI."
+     start_parser.add_argument(
+         "--ai",
+         default=False,
+         help="Make the entire process of loading, processing, reprocessing with AI and exporting with the default configuration.",
+         action="store_true",
      )
+
+     ai_process_parser = subparsers.add_parser("ai", help="Process with AI.")
      ai_process_parser.add_argument(
          "--history", default=False, help="Reprocess ai history", action="store_true"
      )

      seed_parser = subparsers.add_parser(
-         "seed", help="Seed database. Necessary to classify urls."
+         "seed", help="Seed database with `url_types` to classify the `urls`."
+     )
+     seed_parser.add_argument(
+         "--export",
+         default=False,
+         help="Add all `url_types` from the bank to the `/ohmyscrapper/url_types.yaml` file.",
+         action="store_true",
      )
      untouch_parser = subparsers.add_parser(
          "untouch-all", help="Untouch all urls. That resets classification"
@@ -44,12 +61,15 @@ def main():
          "--recursive", default=False, help="Run in recursive mode", action="store_true"
      )

-     load_txt_parser = subparsers.add_parser("load", help="Load txt file")
+     load_txt_parser = subparsers.add_parser("load", help="📦 Load txt file")
      load_txt_parser.add_argument(
-         "-file", default="input/_chat.txt", help="File path. Default is input/_chat.txt"
+         "-input", default=None, help="File/Folder path or url."
+     )
+     load_txt_parser.add_argument(
+         "--verbose", default=False, help="Run in verbose mode", action="store_true"
      )

-     scrap_urls_parser = subparsers.add_parser("scrap-urls", help="Scrap urls")
+     scrap_urls_parser = subparsers.add_parser("scrap-urls", help="🐶 Scrap urls")
      scrap_urls_parser.add_argument(
          "--recursive", default=False, help="Run in recursive mode", action="store_true"
      )
@@ -62,8 +82,11 @@ def main():
      scrap_urls_parser.add_argument(
          "--only-parents", default=False, help="Only parents urls", action="store_true"
      )
+     scrap_urls_parser.add_argument(
+         "--verbose", default=False, help="Run in verbose mode", action="store_true"
+     )

-     sniff_url_parser = subparsers.add_parser("sniff-url", help="Check url")
+     sniff_url_parser = subparsers.add_parser("sniff-url", help="🐕 Sniff/Check url")
      sniff_url_parser.add_argument(
          "url", default="https://cesarcardoso.cc/", help="Url to sniff"
      )
@@ -75,7 +98,7 @@ def main():
      show_urls_parser.add_argument("--limit", default=0, help="Limit of lines to show")
      show_urls_parser.add_argument("-url", default="", help="Url to show")

-     export_parser = subparsers.add_parser("export", help="Export urls to csv.")
+     export_parser = subparsers.add_parser("export", help="📊🖋️ Export urls to csv.")
      export_parser.add_argument("--limit", default=0, help="Limit of lines to export")
      export_parser.add_argument(
          "--file",
@@ -89,14 +112,11 @@ def main():
          action="store_true",
      )

-     report_parser = subparsers.add_parser("report", help="Export urls report to csv.")
+     report_parser = subparsers.add_parser(
+         "report", help="📜🖋️ Export urls report to csv."
+     )
      merge_parser = subparsers.add_parser("merge_dbs", help="Merge databases.")

-     # TODO: What is that?
-     # seed_parser.set_defaults(func=seed)
-     # classify_urls_parser.set_defaults(func=classify_urls)
-     # load_txt_parser.set_defaults(func=load_txt)
-
      args = parser.parse_args()

      if args.command == "classify-urls":
@@ -104,11 +124,14 @@ def main():
          return

      if args.command == "load":
-         load_txt(args.file)
+         load_txt(file_name=args.input, verbose=args.verbose)
          return

      if args.command == "seed":
-         seed()
+         if args.export:
+             export_url_types_to_file()
+         else:
+             seed()
          return

      if args.command == "untouch-all":
@@ -125,6 +148,7 @@ def main():
              ignore_valid_prefix=args.ignore_type,
              randomize=args.randomize,
              only_parents=args.only_parents,
+             verbose=args.verbose,
          )
          return

@@ -157,6 +181,21 @@ def main():
          merge_dbs()
          return

+     if args.command == "start":
+         load_txt()
+         scrap_urls(
+             recursive=True,
+             ignore_valid_prefix=True,
+             randomize=False,
+             only_parents=False,
+         )
+         if args.ai:
+             process_with_ai()
+         export_urls()
+         export_urls(csv_file="output/urls-simplified.csv", simplify=True)
+         export_report()
+         return
+

  if __name__ == "__main__":
      main()
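The new start subcommand chains the whole pipeline (load, scrape, optional AI pass, exports). A minimal smoke-test sketch, assuming the package is installed and an input file sits in the configured input folder:

import sys
from ohmyscrapper import main

# equivalent to running `ohmyscrapper start --ai` from a shell
sys.argv = ["ohmyscrapper", "start", "--ai"]
main()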
ohmyscrapper/core/config.py ADDED
@@ -0,0 +1,95 @@
+ import os
+ from ohmyscrapper.core import config_files
+
+ default_app_dir = "ohmyscrapper"
+
+
+ def get_dir(param="ohmyscrapper"):
+     parent_param = "default_dirs"
+
+     if param == default_app_dir:
+         folder = "./" + param
+     else:
+         folder = config_files.get_param(
+             parent_param=parent_param, param=param, default_app_dir=default_app_dir
+         )
+     if not os.path.exists(folder):
+         os.mkdir(folder)
+     return folder
+
+
+ def get_files(param):
+     parent_param = "default_files"
+     return config_files.get_param(
+         parent_param=parent_param, param=param, default_app_dir=default_app_dir
+     )
+
+
+ def get_db(param="db_file"):
+     if param == "folder":
+         return get_dir(param="db")
+     return config_files.get_param(
+         parent_param="db", param=param, default_app_dir=default_app_dir
+     )
+
+
+ def get_ai(param):
+     return config_files.get_param(
+         parent_param="ai", param=param, default_app_dir=default_app_dir
+     )
+
+
+ def load_config(force_default=False):
+     config_file_name = "config.yaml"
+     config_params = config_files.create_and_read_config_file(
+         file_name=config_file_name,
+         default_app_dir=default_app_dir,
+         force_default=force_default,
+     )
+
+     if config_params is None or "default_dirs" not in config_params:
+         config_params = load_config(force_default=True)
+
+     return config_params
+
+
+ def url_types_file_exists():
+     url_types_file = get_files("url_types")
+     return config_files.config_file_exists(
+         url_types_file, default_app_dir=default_app_dir
+     )
+
+
+ def get_url_types():
+     url_types_file = get_files("url_types")
+     return config_files.create_and_read_config_file(
+         url_types_file, default_app_dir=default_app_dir
+     )
+
+
+ def append_url_types(url_types):
+     url_types_file = get_files("url_types")
+     _append_config_file(url_types, url_types_file)
+
+
+ def overwrite_config_file(data, file_name):
+     config_files.overwrite_config_file(data, file_name, default_app_dir=default_app_dir)
+
+
+ def _append_config_file(data, file_name):
+     config_files.append_config_file(data, file_name, default_app_dir=default_app_dir)
+
+
+ def update():
+     legacy_folder = "./customize"
+     new_folder = "./ohmyscrapper"
+     if os.path.exists(legacy_folder) and not os.path.exists(new_folder):
+         yes_no = input(
+             "We detected a legacy folder system for your OhMyScrapper, would you like to update? \n"
+             "If you don't update, a new version will be used and your legacy folder will be ignored. \n"
+             "[Y] for yes or any other thing to ignore: "
+         )
+         if yes_no == "Y":
+             os.rename(legacy_folder, new_folder)
+             print(" You are up-to-date! =)")
+             print("")
ohmyscrapper/core/config_files.py ADDED
@@ -0,0 +1,73 @@
+ import os
+ import yaml
+
+
+ def create_and_read_config_file(file_name, default_app_dir, force_default=False):
+     config_file = config_file_path(file_name, default_app_dir)
+     if force_default or not os.path.exists(config_file):
+         config_params = _get_default_file(default_file=file_name)
+         overwrite_config_file(
+             data=config_params, file_name=file_name, default_app_dir=default_app_dir
+         )
+     else:
+         with open(config_file, "r") as f:
+             config_params = yaml.safe_load(f.read())
+     if config_params is None:
+         config_params = create_and_read_config_file(
+             file_name=file_name, default_app_dir=default_app_dir, force_default=True
+         )
+     return config_params
+
+
+ def overwrite_config_file(data, file_name, default_app_dir):
+     config_file = config_file_path(file_name, default_app_dir)
+     with open(config_file, "+w") as f:
+         f.write(yaml.safe_dump(data))
+
+
+ def append_config_file(data, file_name, default_app_dir):
+     config_file = config_file_path(file_name, default_app_dir)
+     # append
+     with open(config_file, "+a") as f:
+         yaml.dump(data, f, allow_unicode=True)
+     # read
+     with open(config_file, "r") as f:
+         data = yaml.safe_load(f.read())
+     # overwrite preventing repetition
+     with open(config_file, "w") as f:
+         yaml.dump(data, f, allow_unicode=True)
+
+
+ def get_param(parent_param, param, default_app_dir):
+     default_dirs = create_and_read_config_file(
+         file_name="config.yaml", default_app_dir=default_app_dir
+     )[parent_param]
+
+     if param in default_dirs:
+         return default_dirs[param]
+     else:
+         raise Exception(f"{param} do not exist in your params {parent_param}.")
+
+
+ def config_file_exists(file_name, default_app_dir):
+     return os.path.exists(config_file_path(file_name, default_app_dir))
+
+
+ def config_file_path(file_name, default_app_dir):
+     _ensure_default_app_dir(default_app_dir)
+     config_file = os.path.join(default_app_dir, file_name)
+     return config_file
+
+
+ def _ensure_default_app_dir(default_app_dir):
+     if not os.path.exists(default_app_dir):
+         os.mkdir(default_app_dir)
+
+
+ def _get_default_file(default_file):
+     default_files_dir = os.path.join(
+         os.path.dirname(os.path.realpath(__file__)), "default_files"
+     )
+     default_file = os.path.join(default_files_dir, default_file)
+     with open(default_file, "r") as f:
+         return yaml.safe_load(f.read())
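create_and_read_config_file falls back to the packaged defaults when the user copy is missing or empty. A hedged sketch of the first call on a fresh project:

from ohmyscrapper.core import config_files

# ./ohmyscrapper/config.yaml does not exist yet, so the default bundled under
# core/default_files/ is written there and returned as a dict.
params = config_files.create_and_read_config_file(
    file_name="config.yaml", default_app_dir="ohmyscrapper"
)
print(params["default_dirs"]["output"])  # "./output" with the shipped defaults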
ohmyscrapper/core/default_files/config.yaml ADDED
@@ -0,0 +1,15 @@
+ db:
+   db_file: local.db
+
+ default_dirs:
+   db: ./db
+   input: ./input
+   output: ./output
+   prompts: ./prompts
+   templates: ./templates
+
+ default_files:
+   url_types: url_types.yaml
+
+ ai:
+   default_prompt_file: prompt.md
ohmyscrapper/core/default_files/url_types.yaml ADDED
@@ -0,0 +1,5 @@
+ linkedin_company: https://%.linkedin.com/company/%
+ linkedin_feed: https://%.linkedin.com/feed/%
+ linkedin_job: https://%.linkedin.com/jobs/view/%
+ linkedin_post: https://%.linkedin.com/posts/%
+ linkedin_redirect: https://lnkd.in/%
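These defaults can be extended through config.append_url_types; since append_config_file re-reads and re-dumps the YAML, repeated keys collapse instead of duplicating. Illustrative sketch (github_repo is a made-up type, not shipped with the package):

from ohmyscrapper.core import config

# values are SQL LIKE patterns, in the same style as the shipped LinkedIn prefixes
config.append_url_types({"github_repo": "https://github.com/%"})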
@@ -4,27 +4,37 @@ import time
  import glob
  import pandas as pd
  from urllib.parse import urlparse, urlunparse
+ from ohmyscrapper.core import config


  def get_db_dir():
-     if not os.path.exists("db"):
-         os.mkdir("db")
-     return "db"
+     db_folder = config.get_dir("db")
+     if not os.path.exists(db_folder):
+         os.mkdir(db_folder)
+     return db_folder


  def get_db_path():
-     return get_db_dir() + "/local.db"
+     db_file = config.get_db()
+     return os.path.join(get_db_dir(), db_file)


  def get_db_connection():
+     if not os.path.exists(get_db_path()):
+         create_tables(sqlite3.connect(get_db_path()))
      return sqlite3.connect(get_db_path())


- # TODO: check if it makes sense
- conn = get_db_connection()
+ def use_connection(func):
+     def provide_connection(*args, **kwargs):
+         global conn
+         with get_db_connection() as conn:
+             return func(*args, **kwargs)

+     return provide_connection

- def create_tables():
+
+ def create_tables(conn):

      c = conn.cursor()
      c.execute(
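The use_connection decorator replaces the old module-level connection: each decorated helper gets a fresh connection bound to the module-global conn for the duration of the call. A simplified standalone sketch of the same pattern (database path and table are illustrative):

import sqlite3

def use_connection(func):
    def provide_connection(*args, **kwargs):
        global conn
        with sqlite3.connect("local.db") as conn:  # illustrative path
            return func(*args, **kwargs)
    return provide_connection

@use_connection
def count_urls():
    # conn here is the connection opened by the decorator for this call
    return conn.execute("SELECT COUNT(*) FROM urls").fetchone()[0]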
@@ -38,27 +48,17 @@
          "CREATE TABLE IF NOT EXISTS urls_valid_prefix (id INTEGER PRIMARY KEY, url_prefix TEXT UNIQUE, url_type TEXT)"
      )

-     return pd.read_sql_query("SELECT * FROM urls LIMIT 100", conn)
-

- # TODO: not sure this should be something. depends on the project
- def seeds():
-     create_tables()
+ def seeds(seeds={}):

-     add_urls_valid_prefix("https://%.linkedin.com/posts/%", "linkedin_post")
-     add_urls_valid_prefix("https://lnkd.in/%", "linkedin_redirect")
-     add_urls_valid_prefix("https://%.linkedin.com/jobs/view/%", "linkedin_job")
-     add_urls_valid_prefix("https://%.linkedin.com/feed/%", "linkedin_feed")
-     add_urls_valid_prefix("https://%.linkedin.com/company/%", "linkedin_company")
-
-     # add_urls_valid_prefix("%.pdf", "pdf")
-     # add_url('https://imazon.org.br/categorias/artigos-cientificos/')
+     for url_type, url_prefix in seeds.items():
+         add_urls_valid_prefix(url_prefix, url_type)

      return True


+ @use_connection
  def add_urls_valid_prefix(url_prefix, url_type):
-     conn = get_db_connection()

      df = pd.read_sql_query(
          f"SELECT * FROM urls_valid_prefix WHERE url_prefix = '{url_prefix}'", conn
@@ -72,6 +72,7 @@ def add_urls_valid_prefix(url_prefix, url_type):
72
72
  conn.commit()
73
73
 
74
74
 
75
+ @use_connection
75
76
  def get_urls_valid_prefix_by_type(url_type):
76
77
  df = pd.read_sql_query(
77
78
  f"SELECT * FROM urls_valid_prefix WHERE url_type = '{url_type}'", conn
@@ -79,12 +80,14 @@ def get_urls_valid_prefix_by_type(url_type):
79
80
  return df
80
81
 
81
82
 
83
+ @use_connection
82
84
  def get_urls_valid_prefix_by_id(id):
83
85
  df = pd.read_sql_query(f"SELECT * FROM urls_valid_prefix WHERE id = '{id}'", conn)
84
86
  return df
85
87
 
86
88
 
87
89
  # TODO: pagination required
90
+ @use_connection
88
91
  def get_urls_valid_prefix(limit=0):
89
92
  if limit > 0:
90
93
  df = pd.read_sql_query(f"SELECT * FROM urls_valid_prefix LIMIT {limit}", conn)
@@ -94,6 +97,7 @@ def get_urls_valid_prefix(limit=0):
94
97
 
95
98
 
96
99
  # TODO: pagination required
100
+ @use_connection
97
101
  def get_urls(limit=0):
98
102
  if limit > 0:
99
103
  df = pd.read_sql_query(
@@ -104,6 +108,7 @@ def get_urls(limit=0):
104
108
  return df
105
109
 
106
110
 
111
+ @use_connection
107
112
  def get_urls_report():
108
113
  sql = """
109
114
  WITH parent_url AS (
@@ -138,6 +143,7 @@ def get_urls_report():
138
143
  return df
139
144
 
140
145
 
146
+ @use_connection
141
147
  def get_url_by_url(url):
142
148
  url = clean_url(url)
143
149
  df = pd.read_sql_query(f"SELECT * FROM urls WHERE url = '{url}'", conn)
@@ -145,12 +151,14 @@ def get_url_by_url(url):
145
151
  return df
146
152
 
147
153
 
154
+ @use_connection
148
155
  def get_url_by_id(id):
149
156
  df = pd.read_sql_query(f"SELECT * FROM urls WHERE id = '{id}'", conn)
150
157
 
151
158
  return df
152
159
 
153
160
 
161
+ @use_connection
154
162
  def get_urls_by_url_type(url_type):
155
163
  df = pd.read_sql_query(
156
164
  f"SELECT * FROM urls WHERE history = 0 AND url_type = '{url_type}'", conn
@@ -158,6 +166,7 @@ def get_urls_by_url_type(url_type):
158
166
  return df
159
167
 
160
168
 
169
+ @use_connection
161
170
  def get_urls_by_url_type_for_ai_process(url_type="linkedin_post", limit=10):
162
171
  df = pd.read_sql_query(
163
172
  f"SELECT * FROM urls WHERE history = 0 AND url_type = '{url_type}' AND ai_processed = 0 LIMIT {limit}",
@@ -166,6 +175,7 @@ def get_urls_by_url_type_for_ai_process(url_type="linkedin_post", limit=10):
166
175
  return df
167
176
 
168
177
 
178
+ @use_connection
169
179
  def get_url_like_unclassified(like_condition):
170
180
  df = pd.read_sql_query(
171
181
  f"SELECT * FROM urls WHERE history = 0 AND url LIKE '{like_condition}' AND url_type IS NULL",
@@ -174,6 +184,7 @@ def get_url_like_unclassified(like_condition):
174
184
  return df
175
185
 
176
186
 
187
+ @use_connection
177
188
  def add_url(url, h1=None, parent_url=None):
178
189
  url = clean_url(url)
179
190
  c = conn.cursor()
@@ -196,6 +207,7 @@ def add_url(url, h1=None, parent_url=None):
      return get_url_by_url(url)


+ @use_connection
  def add_ai_log(instructions, response, model, prompt_file, prompt_name):
      c = conn.cursor()

@@ -205,10 +217,14 @@ def add_ai_log(instructions, response, model, prompt_file, prompt_name):
      )
      conn.commit()

+
+ @use_connection
  def get_ai_log():
      df = pd.read_sql_query(f"SELECT * FROM ai_log", conn)
      return df

+
+ @use_connection
  def set_url_destiny(url, destiny):
      url = clean_url(url)
      destiny = clean_url(destiny)
@@ -222,6 +238,7 @@ def set_url_destiny(url, destiny):
      conn.commit()


+ @use_connection
  def set_url_h1(url, value):
      value = str(value).strip()
      url = clean_url(url)
@@ -230,6 +247,7 @@ def set_url_h1(url, value):
      conn.commit()


+ @use_connection
  def set_url_h1_by_id(id, value):
      value = str(value).strip()

@@ -238,29 +256,44 @@ def set_url_h1_by_id(id, value):
      conn.commit()


+ @use_connection
  def set_url_ai_processed_by_id(id, json_str):
      value = 1
      value = str(value).strip()
      c = conn.cursor()
-     c.execute("UPDATE urls SET ai_processed = ? , json_ai = ? WHERE id = ?", (value, json_str, id))
+     c.execute(
+         "UPDATE urls SET ai_processed = ? , json_ai = ? WHERE id = ?",
+         (value, json_str, id),
+     )
      conn.commit()

+
+ @use_connection
  def set_url_empty_ai_processed_by_id(id, json_str="empty result"):
      value = 1
      value = str(value).strip()
      c = conn.cursor()
-     c.execute("UPDATE urls SET ai_processed = ? , json_ai = ? WHERE ai_processed = 0 AND id = ?", (value, json_str, id))
+     c.execute(
+         "UPDATE urls SET ai_processed = ? , json_ai = ? WHERE ai_processed = 0 AND id = ?",
+         (value, json_str, id),
+     )
      conn.commit()

+
+ @use_connection
  def set_url_ai_processed_by_url(url, json_str):
      value = 1
      value = str(value).strip()
      url = clean_url(url)
      c = conn.cursor()
-     c.execute("UPDATE urls SET ai_processed = ?, json_ai = ? WHERE url = ?", (value, json_str, url))
+     c.execute(
+         "UPDATE urls SET ai_processed = ?, json_ai = ? WHERE url = ?",
+         (value, json_str, url),
+     )
      conn.commit()


+ @use_connection
  def set_url_description(url, value):
      url = clean_url(url)
      c = conn.cursor()
@@ -268,6 +301,7 @@ def set_url_description(url, value):
      conn.commit()


+ @use_connection
  def set_url_description_links(url, value):
      url = clean_url(url)
      c = conn.cursor()
@@ -275,6 +309,7 @@ def set_url_description_links(url, value):
      conn.commit()


+ @use_connection
  def set_url_json(url, value):
      url = clean_url(url)
      c = conn.cursor()
@@ -282,6 +317,7 @@ def set_url_json(url, value):
      conn.commit()


+ @use_connection
  def set_url_error(url, value):
      url = clean_url(url)
      c = conn.cursor()
@@ -289,6 +325,7 @@ def set_url_error(url, value):
      conn.commit()


+ @use_connection
  def set_url_type_by_id(url_id, url_type):
      c = conn.cursor()
      c.execute(f"UPDATE urls SET url_type = '{url_type}' WHERE id = {url_id}")
@@ -312,6 +349,7 @@ def clean_url(url):
      return url


+ @use_connection
  def get_untouched_urls(
      limit=10, randomize=True, ignore_valid_prefix=False, only_parents=True
  ):
@@ -331,6 +369,7 @@
      return df


+ @use_connection
  def touch_url(url):
      url = clean_url(url)
      c = conn.cursor()
@@ -338,6 +377,7 @@ def touch_url(url):
      conn.commit()


+ @use_connection
  def untouch_url(url):
      url = clean_url(url)
      c = conn.cursor()
@@ -345,12 +385,14 @@ def untouch_url(url):
      conn.commit()


+ @use_connection
  def untouch_all_urls():
      c = conn.cursor()
      c.execute("UPDATE urls SET last_touch = NULL WHERE history = 0")
      conn.commit()


+ @use_connection
  def set_all_urls_as_history():
      c = conn.cursor()
      c.execute("UPDATE urls SET history = 1")
@@ -382,9 +424,9 @@ def merge_dbs() -> None:
              row["description"],
              row["json"],
          )
-     # ßmerge_url(df)


+ @use_connection
  def merge_url(url, h1, last_touch, created_at, description, json):
      url = clean_url(url)
      c = conn.cursor()
ohmyscrapper/modules/classify_urls.py CHANGED
@@ -9,15 +9,19 @@ def classify_urls(recursive=False):

      keep_alive = True
      while keep_alive:
-         print("waking up!")
+         print("#️⃣ URL Classifier woke up to classify urls!")
          for index, row_prefix in df.iterrows():
-             df_urls = urls_manager.get_url_like_unclassified(like_condition=row_prefix["url_prefix"])
+             df_urls = urls_manager.get_url_like_unclassified(
+                 like_condition=row_prefix["url_prefix"]
+             )
              for index, row_urls in df_urls.iterrows():
-                 urls_manager.set_url_type_by_id(url_id =row_urls["id"], url_type=row_prefix["url_type"])
+                 urls_manager.set_url_type_by_id(
+                     url_id=row_urls["id"], url_type=row_prefix["url_type"]
+                 )

          if not recursive:
-             print("ending...")
+             print("#️⃣ URL Classifier said: I'm done! See you soon...")
              keep_alive = False
          else:
-             print("sleeping...")
+             print("#️⃣ URL Classifier is taking a nap...")
              time.sleep(10)
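For context on the classification rule: each url_prefix is used as a SQL LIKE pattern, so % acts as a wildcard. A tiny standalone check with an illustrative URL:

import sqlite3

conn = sqlite3.connect(":memory:")
matched = conn.execute(
    "SELECT ? LIKE ?",
    ("https://www.linkedin.com/posts/jane-doe-activity-123", "https://%.linkedin.com/posts/%"),
).fetchone()[0]
print(matched)  # 1 -> such a url would get url_type = "linkedin_post"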