ohmyscrapper 0.4.0__tar.gz → 0.7.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (23)
  1. {ohmyscrapper-0.4.0 → ohmyscrapper-0.7.4}/PKG-INFO +6 -3
  2. {ohmyscrapper-0.4.0 → ohmyscrapper-0.7.4}/README.md +1 -1
  3. {ohmyscrapper-0.4.0 → ohmyscrapper-0.7.4}/pyproject.toml +9 -2
  4. {ohmyscrapper-0.4.0 → ohmyscrapper-0.7.4}/src/ohmyscrapper/__init__.py +41 -6
  5. ohmyscrapper-0.7.4/src/ohmyscrapper/core/config.py +113 -0
  6. ohmyscrapper-0.7.4/src/ohmyscrapper/core/config_files.py +100 -0
  7. ohmyscrapper-0.7.4/src/ohmyscrapper/core/default_files/config.yaml +19 -0
  8. ohmyscrapper-0.7.4/src/ohmyscrapper/core/default_files/url_sniffing.yaml +29 -0
  9. ohmyscrapper-0.7.4/src/ohmyscrapper/core/default_files/url_types.yaml +5 -0
  10. {ohmyscrapper-0.4.0 → ohmyscrapper-0.7.4}/src/ohmyscrapper/models/urls_manager.py +58 -31
  11. {ohmyscrapper-0.4.0 → ohmyscrapper-0.7.4}/src/ohmyscrapper/modules/classify_urls.py +5 -1
  12. {ohmyscrapper-0.4.0 → ohmyscrapper-0.7.4}/src/ohmyscrapper/modules/load_txt.py +26 -20
  13. {ohmyscrapper-0.4.0 → ohmyscrapper-0.7.4}/src/ohmyscrapper/modules/process_with_ai.py +17 -13
  14. ohmyscrapper-0.7.4/src/ohmyscrapper/modules/scrap_urls.py +193 -0
  15. ohmyscrapper-0.7.4/src/ohmyscrapper/modules/seed.py +36 -0
  16. {ohmyscrapper-0.4.0 → ohmyscrapper-0.7.4}/src/ohmyscrapper/modules/show.py +15 -14
  17. ohmyscrapper-0.7.4/src/ohmyscrapper/modules/sniff_url.py +165 -0
  18. ohmyscrapper-0.4.0/src/ohmyscrapper/modules/scrap_urls.py +0 -209
  19. ohmyscrapper-0.4.0/src/ohmyscrapper/modules/seed.py +0 -7
  20. ohmyscrapper-0.4.0/src/ohmyscrapper/modules/sniff_url.py +0 -88
  21. {ohmyscrapper-0.4.0 → ohmyscrapper-0.7.4}/src/ohmyscrapper/__main__.py +0 -0
  22. {ohmyscrapper-0.4.0 → ohmyscrapper-0.7.4}/src/ohmyscrapper/modules/merge_dbs.py +0 -0
  23. {ohmyscrapper-0.4.0 → ohmyscrapper-0.7.4}/src/ohmyscrapper/modules/untouch_all.py +0 -0

{ohmyscrapper-0.4.0 → ohmyscrapper-0.7.4}/PKG-INFO

@@ -1,9 +1,10 @@
- Metadata-Version: 2.3
+ Metadata-Version: 2.4
  Name: ohmyscrapper
- Version: 0.4.0
+ Version: 0.7.4
  Summary: OhMyScrapper scrapes texts and urls looking for links and jobs-data to create a final report with general information about job positions.
  Author: Cesar Cardoso
  Author-email: Cesar Cardoso <hello@cesarcardoso.cc>
+ License-Expression: MIT
  Requires-Dist: beautifulsoup4>=4.14.3
  Requires-Dist: google-genai>=1.55.0
  Requires-Dist: markdown>=3.10
@@ -14,9 +15,11 @@ Requires-Dist: requests>=2.32.5
  Requires-Dist: rich>=14.2.0
  Requires-Dist: urlextract>=1.9.0
  Requires-Python: >=3.11
+ Project-URL: Changelog, https://github.com/bouli/ohmyscrapper/releases/latest
+ Project-URL: Repository, https://github.com/bouli/ohmyscrapper
  Description-Content-Type: text/markdown

- # 🐶 OhMyScrapper - v0.4.0
+ # 🐶 OhMyScrapper - v0.7.4

  OhMyScrapper scrapes texts and urls looking for links and jobs-data to create a
  final report with general information about job positions.

{ohmyscrapper-0.4.0 → ohmyscrapper-0.7.4}/README.md

@@ -1,4 +1,4 @@
- # 🐶 OhMyScrapper - v0.4.0
+ # 🐶 OhMyScrapper - v0.7.4

  OhMyScrapper scrapes texts and urls looking for links and jobs-data to create a
  final report with general information about job positions.

{ohmyscrapper-0.4.0 → ohmyscrapper-0.7.4}/pyproject.toml

@@ -1,6 +1,7 @@
  [project]
  name = "ohmyscrapper"
- version = "0.4.0"
+ version = "0.7.4"
+ license = "MIT"
  description = "OhMyScrapper scrapes texts and urls looking for links and jobs-data to create a final report with general information about job positions."
  readme = "README.md"
  authors = [
@@ -19,6 +20,10 @@ dependencies = [
  "urlextract>=1.9.0",
  ]

+ [project.urls]
+ Repository = "https://github.com/bouli/ohmyscrapper"
+ Changelog = "https://github.com/bouli/ohmyscrapper/releases/latest"
+
  [project.scripts]
  ohmyscrapper = "ohmyscrapper:main"

@@ -29,11 +34,13 @@ build-backend = "uv_build"
  [tool.bumpversion]
  tag = true
  tag_name = "v{new_version}"
- pre_commit_hooks = ["uv sync --upgrade", "git add uv.lock"]
+ pre_commit_hooks = ["uvx black ./src", "git add src", "git commit -m 'chore: Beautify with black'", "uv sync --upgrade", "git add uv.lock"]
  commit = true

  [[tool.bumpversion.files]]
  filename = "pyproject.toml"
+ search = 'version = "{current_version}"'
+ replace = 'version = "{new_version}"'

  [[tool.bumpversion.files]]
  filename = "README.md"

{ohmyscrapper-0.4.0 → ohmyscrapper-0.7.4}/src/ohmyscrapper/__init__.py

@@ -3,7 +3,7 @@ import argparse
  from ohmyscrapper.modules.classify_urls import classify_urls
  from ohmyscrapper.modules.sniff_url import sniff_url
  from ohmyscrapper.modules.load_txt import load_txt
- from ohmyscrapper.modules.seed import seed
+ from ohmyscrapper.modules.seed import seed, export_url_types_to_file
  from ohmyscrapper.modules.scrap_urls import scrap_urls
  from ohmyscrapper.modules.show import (
  show_url,
@@ -15,17 +15,22 @@ from ohmyscrapper.modules.show import (
  from ohmyscrapper.modules.untouch_all import untouch_all
  from ohmyscrapper.modules.process_with_ai import process_with_ai, reprocess_ai_history
  from ohmyscrapper.modules.merge_dbs import merge_dbs
+ from ohmyscrapper.core.config import update


  def main():
  parser = argparse.ArgumentParser(prog="ohmyscrapper")
- parser.add_argument("--version", action="version", version="%(prog)s v0.4.0")
+ parser.add_argument("--version", action="version", version="%(prog)s v0.7.4")

+ update()
  subparsers = parser.add_subparsers(dest="command", help="Available commands")
  start_parser = subparsers.add_parser(
  "start",
  help="Make the entire process of 📦 loading, 🐶 scraping and 📜🖋️ exporting with the default configuration.",
  )
+ start_parser.add_argument(
+ "-input", default=None, help="File/Folder path or url for pre-loading."
+ )

  start_parser.add_argument(
  "--ai",
@@ -40,8 +45,22 @@ def main():
  )

  seed_parser = subparsers.add_parser(
- "seed", help="Seed database. Necessary to classify urls."
+ "seed", help="Seed database with `url_types` to classify the `urls`."
+ )
+ seed_parser.add_argument(
+ "--export",
+ default=False,
+ help="Add all `url_types` from the bank to the `/ohmyscrapper/url_types.yaml` file.",
+ action="store_true",
  )
+
+ seed_parser.add_argument(
+ "--reset",
+ default=False,
+ help="Reset all `url_types`.",
+ action="store_true",
+ )
+
  untouch_parser = subparsers.add_parser(
  "untouch-all", help="Untouch all urls. That resets classification"
  )
@@ -54,7 +73,9 @@ def main():
  )

  load_txt_parser = subparsers.add_parser("load", help="📦 Load txt file")
- load_txt_parser.add_argument("-input", default=None, help="File path or url.")
+ load_txt_parser.add_argument(
+ "-input", default=None, help="File/Folder path or url."
+ )
  load_txt_parser.add_argument(
  "--verbose", default=False, help="Run in verbose mode", action="store_true"
  )
@@ -75,6 +96,9 @@ def main():
  scrap_urls_parser.add_argument(
  "--verbose", default=False, help="Run in verbose mode", action="store_true"
  )
+ scrap_urls_parser.add_argument(
+ "-input", default=None, help="File/Folder path or url for pre-loading."
+ )

  sniff_url_parser = subparsers.add_parser("sniff-url", help="🐕 Sniff/Check url")
  sniff_url_parser.add_argument(
@@ -118,7 +142,10 @@ def main():
  return

  if args.command == "seed":
- seed()
+ if args.export:
+ export_url_types_to_file()
+ else:
+ seed(args.reset)
  return

  if args.command == "untouch-all":
@@ -130,6 +157,9 @@ def main():
  return

  if args.command == "scrap-urls":
+ if args.input != None:
+ load_txt(file_name=args.input, verbose=args.verbose)
+
  scrap_urls(
  recursive=args.recursive,
  ignore_valid_prefix=args.ignore_type,
@@ -169,7 +199,12 @@ def main():
  return

  if args.command == "start":
- load_txt()
+ seed()
+ if args.input != None:
+ load_txt(file_name=args.input)
+ else:
+ load_txt()
+
  scrap_urls(
  recursive=True,
  ignore_valid_prefix=True,

ohmyscrapper-0.7.4/src/ohmyscrapper/core/config.py

@@ -0,0 +1,113 @@
+ import os
+ from ohmyscrapper.core import config_files
+
+ default_app_dir = "ohmyscrapper"
+
+
+ def get_dir(param="ohmyscrapper"):
+ parent_param = "default_dirs"
+
+ if param == default_app_dir:
+ folder = "./" + param
+ else:
+ folder = config_files.get_param(
+ parent_param=parent_param, param=param, default_app_dir=default_app_dir
+ )
+ if not os.path.exists(folder):
+ os.mkdir(folder)
+ return folder
+
+
+ def get_files(param):
+ parent_param = "default_files"
+ return config_files.get_param(
+ parent_param=parent_param, param=param, default_app_dir=default_app_dir
+ )
+
+
+ def get_db(param="db_file"):
+ if param == "folder":
+ return get_dir(param="db")
+ return config_files.get_param(
+ parent_param="db", param=param, default_app_dir=default_app_dir
+ )
+
+
+ def get_ai(param):
+ return config_files.get_param(
+ parent_param="ai", param=param, default_app_dir=default_app_dir
+ )
+
+
+ def get_sniffing(param):
+ return config_files.get_param(
+ parent_param="sniffing", param=param, default_app_dir=default_app_dir
+ )
+
+
+ def load_config(force_default=False):
+ config_file_name = "config.yaml"
+ config_params = config_files.create_and_read_config_file(
+ file_name=config_file_name,
+ default_app_dir=default_app_dir,
+ force_default=force_default,
+ )
+
+ if config_params is None or "default_dirs" not in config_params:
+ config_params = load_config(force_default=True)
+
+ return config_params
+
+
+ def url_types_file_exists():
+ url_types_file = get_files("url_types")
+ return config_files.config_file_exists(
+ url_types_file, default_app_dir=default_app_dir
+ )
+
+
+ def get_url_types():
+ url_types_file = get_files("url_types")
+ return config_files.create_and_read_config_file(
+ url_types_file, default_app_dir=default_app_dir
+ )
+
+
+ def get_url_sniffing():
+ file = get_files("url_sniffing")
+ return config_files.create_and_read_config_file(
+ file, default_app_dir=default_app_dir
+ )
+
+
+ def append_url_sniffing(data):
+ file = get_files("url_sniffing")
+ _append_config_file(data, file)
+
+
+ def append_url_types(url_types):
+ url_types_file = get_files("url_types")
+ _append_config_file(url_types, url_types_file)
+
+
+ def overwrite_config_file(data, file_name):
+ config_files.overwrite_config_file(data, file_name, default_app_dir=default_app_dir)
+
+
+ def _append_config_file(data, file_name):
+ config_files.append_config_file(data, file_name, default_app_dir=default_app_dir)
+
+
+ def update():
+ legacy_folder = "./customize"
+ new_folder = "./ohmyscrapper"
+ if os.path.exists(legacy_folder) and not os.path.exists(new_folder):
+ yes_no = input(
+ "We detected a legacy folder system for your OhMyScrapper, would you like to update? \n"
+ "If you don't update, a new version will be used and your legacy folder will be ignored. \n"
+ "[Y] for yes or any other thing to ignore: "
+ )
+ if yes_no == "Y":
+ os.rename(legacy_folder, new_folder)
+ print(" You are up-to-date! =)")
+ print("")

ohmyscrapper-0.7.4/src/ohmyscrapper/core/config_files.py

@@ -0,0 +1,100 @@
+ import os
+ import yaml
+
+
+ def create_and_read_config_file(file_name, default_app_dir, force_default=False):
+ config_file = config_file_path(file_name, default_app_dir)
+ default_config_params = _get_default_file(default_file=file_name)
+ if force_default or not os.path.exists(config_file):
+ overwrite_config_file(
+ data=default_config_params,
+ file_name=file_name,
+ default_app_dir=default_app_dir,
+ )
+ config_params = default_config_params
+ else:
+ with open(config_file, "r") as f:
+ config_params = yaml.safe_load(f.read())
+ if complete_config_file(
+ config_params=config_params,
+ default_config_params=default_config_params,
+ file_name=file_name,
+ default_app_dir=default_app_dir,
+ ):
+ config_params = create_and_read_config_file(
+ file_name=file_name,
+ default_app_dir=default_app_dir,
+ force_default=force_default,
+ )
+
+ if config_params is None:
+ config_params = create_and_read_config_file(
+ file_name=file_name, default_app_dir=default_app_dir, force_default=True
+ )
+ return config_params
+
+
+ def complete_config_file(
+ config_params, default_config_params, file_name, default_app_dir
+ ):
+ has_updated = False
+ for key, values in default_config_params.items():
+ if key not in config_params.keys():
+ has_updated = True
+ data = {key: values}
+ append_config_file(data, file_name, default_app_dir)
+ return has_updated
+
+
+ def overwrite_config_file(data, file_name, default_app_dir):
+ config_file = config_file_path(file_name, default_app_dir)
+ with open(config_file, "+w") as f:
+ f.write(yaml.safe_dump(data))
+
+
+ def append_config_file(data, file_name, default_app_dir):
+ config_file = config_file_path(file_name, default_app_dir)
+ # append
+ with open(config_file, "+a") as f:
+ yaml.dump(data, f, allow_unicode=True)
+ # read
+ with open(config_file, "r") as f:
+ data = yaml.safe_load(f.read())
+ # overwrite preventing repetition
+ with open(config_file, "w") as f:
+ yaml.dump(data, f, allow_unicode=True)
+
+
+ def get_param(parent_param, param, default_app_dir):
+ default_dirs = create_and_read_config_file(
+ file_name="config.yaml", default_app_dir=default_app_dir
+ )[parent_param]
+
+ if param in default_dirs:
+ return default_dirs[param]
+ else:
+ raise Exception(f"{param} do not exist in your params {parent_param}.")
+
+
+ def config_file_exists(file_name, default_app_dir):
+ return os.path.exists(config_file_path(file_name, default_app_dir))
+
+
+ def config_file_path(file_name, default_app_dir):
+ _ensure_default_app_dir(default_app_dir)
+ config_file = os.path.join(default_app_dir, file_name)
+ return config_file
+
+
+ def _ensure_default_app_dir(default_app_dir):
+ if not os.path.exists(default_app_dir):
+ os.mkdir(default_app_dir)
+
+
+ def _get_default_file(default_file):
+ default_files_dir = os.path.join(
+ os.path.dirname(os.path.realpath(__file__)), "default_files"
+ )
+ default_file = os.path.join(default_files_dir, default_file)
+ with open(default_file, "r") as f:
+ return yaml.safe_load(f.read())

ohmyscrapper-0.7.4/src/ohmyscrapper/core/default_files/config.yaml

@@ -0,0 +1,19 @@
+ db:
+ db_file: local.db
+
+ default_dirs:
+ db: ./db
+ input: ./input
+ output: ./output
+ prompts: ./prompts
+ templates: ./templates
+
+ default_files:
+ url_types: url_types.yaml
+ url_sniffing: url_sniffing.yaml
+
+ ai:
+ default_prompt_file: prompt.md
+
+ sniffing:
+ timeout: 10

ohmyscrapper-0.7.4/src/ohmyscrapper/core/default_files/url_sniffing.yaml

@@ -0,0 +1,29 @@
+ linkedin_feed:
+ metatags:
+ og:url: url_destiny
+
+ linkedin_job:
+ bodytags:
+ h1: title
+ metatags:
+ og:title: title
+ og:description: description
+ description: description
+
+ linkedin_post:
+ bodytags:
+ h1: title
+ metatags:
+ og:title: title
+ og:description: description
+ description: description
+
+ linkedin_redirect:
+ metatags:
+ og:url: url_destiny
+ atags:
+ first-tag-as-url_destiny: 5
+
+ read_all_a_tags:
+ atags:
+ load_atags: True

ohmyscrapper-0.7.4/src/ohmyscrapper/core/default_files/url_types.yaml

@@ -0,0 +1,5 @@
+ linkedin_company: https://%.linkedin.com/company/%
+ linkedin_feed: https://%.linkedin.com/feed/%
+ linkedin_job: https://%.linkedin.com/jobs/view/%
+ linkedin_post: https://%.linkedin.com/posts/%
+ linkedin_redirect: https://lnkd.in/%

{ohmyscrapper-0.4.0 → ohmyscrapper-0.7.4}/src/ohmyscrapper/models/urls_manager.py

@@ -4,16 +4,19 @@ import time
  import glob
  import pandas as pd
  from urllib.parse import urlparse, urlunparse
+ from ohmyscrapper.core import config


  def get_db_dir():
- if not os.path.exists("db"):
- os.mkdir("db")
- return "db"
+ db_folder = config.get_dir("db")
+ if not os.path.exists(db_folder):
+ os.mkdir(db_folder)
+ return db_folder


  def get_db_path():
- return get_db_dir() + "/local.db"
+ db_file = config.get_db()
+ return os.path.join(get_db_dir(), db_file)


  def get_db_connection():
@@ -26,7 +29,11 @@ def use_connection(func):
  def provide_connection(*args, **kwargs):
  global conn
  with get_db_connection() as conn:
- return func(*args, **kwargs)
+ try:
+ return func(*args, **kwargs)
+ except:
+ update_db()
+ return func(*args, **kwargs)

  return provide_connection

@@ -35,7 +42,7 @@ def create_tables(conn):

  c = conn.cursor()
  c.execute(
- "CREATE TABLE IF NOT EXISTS urls (id INTEGER PRIMARY KEY, url_type STRING, parent_url TEXT, url TEXT UNIQUE, url_destiny TEXT, h1 TEXT, error TEXT, description TEXT, description_links INTEGER DEFAULT 0, json TEXT, json_ai TEXT, ai_processed INTEGER DEFAULT 0, history INTEGER DEFAULT 0, last_touch DATETIME, created_at DATETIME)"
+ "CREATE TABLE IF NOT EXISTS urls (id INTEGER PRIMARY KEY, url_type STRING, parent_url TEXT, url TEXT UNIQUE, url_destiny TEXT, title TEXT, error TEXT, description TEXT, description_links INTEGER DEFAULT 0, json TEXT, json_ai TEXT, ai_processed INTEGER DEFAULT 0, history INTEGER DEFAULT 0, last_touch DATETIME, created_at DATETIME)"
  )
  c.execute(
  "CREATE TABLE IF NOT EXISTS ai_log (id INTEGER PRIMARY KEY, instructions STRING, response STRING, model STRING, prompt_file STRING, prompt_name STRING, created_at DATETIME)"
@@ -46,16 +53,30 @@ def create_tables(conn):
  )


- def seeds():
- add_urls_valid_prefix("https://%.linkedin.com/posts/%", "linkedin_post")
- add_urls_valid_prefix("https://lnkd.in/%", "linkedin_redirect")
- add_urls_valid_prefix("https://%.linkedin.com/jobs/view/%", "linkedin_job")
- add_urls_valid_prefix("https://%.linkedin.com/feed/%", "linkedin_feed")
- add_urls_valid_prefix("https://%.linkedin.com/company/%", "linkedin_company")
+ def update_db():
+ try:
+ c = conn.cursor()
+ c.execute("ALTER TABLE urls RENAME COLUMN h1 TO title")
+ except:
+ pass
+
+
+ def seeds(seeds={}):
+
+ for url_type, url_prefix in seeds.items():
+ add_urls_valid_prefix(url_prefix, url_type)

  return True


+ @use_connection
+ def reset_seeds():
+ sql = "DELETE FROM urls_valid_prefix"
+ c = conn.cursor()
+ c.execute(sql)
+ conn.commit()
+
+
  @use_connection
  def add_urls_valid_prefix(url_prefix, url_type):

@@ -117,7 +138,7 @@ def get_urls_report():
  SELECT
  u.id,
  u.url,
- u.h1
+ u.title
  FROM urls u
  INNER JOIN parent_url p
  ON u.url = p.parent_url
@@ -126,9 +147,9 @@ def get_urls_report():
  u.id,
  u.url_type,
  u.url,
- COALESCE(u.h1, p.h1) as h1,
+ COALESCE(u.title, p.title) as title,
  p.url as parent_url,
- p.h1 as parent_h1
+ p.title as parent_title
  FROM urls u
  LEFT JOIN parents p
  ON u.parent_url = p.url
@@ -184,12 +205,14 @@ def get_url_like_unclassified(like_condition):


  @use_connection
- def add_url(url, h1=None, parent_url=None):
+ def add_url(url, title=None, parent_url=None):
+ if url[:1] == "/":
+ return
  url = clean_url(url)
  c = conn.cursor()

- if h1 is not None:
- h1 = h1.strip()
+ if title is not None:
+ title = title.strip()

  if parent_url is None:
  parent_url = None
@@ -198,8 +221,8 @@ def add_url(url, h1=None, parent_url=None):

  if len(get_url_by_url(url)) == 0:
  c.execute(
- "INSERT INTO urls (url, h1, parent_url, created_at, ai_processed, description_links, history) VALUES (?, ?, ?, ?, 0, 0, 0)",
- (url, h1, parent_url, int(time.time())),
+ "INSERT INTO urls (url, title, parent_url, created_at, ai_processed, description_links, history) VALUES (?, ?, ?, ?, 0, 0, 0)",
+ (url, title, parent_url, int(time.time())),
  )
  conn.commit()

@@ -238,20 +261,20 @@ def set_url_destiny(url, destiny):


  @use_connection
- def set_url_h1(url, value):
+ def set_url_title(url, value):
  value = str(value).strip()
  url = clean_url(url)
  c = conn.cursor()
- c.execute("UPDATE urls SET h1 = ? WHERE url = ?", (value, url))
+ c.execute("UPDATE urls SET title = ? WHERE url = ?", (value, url))
  conn.commit()


  @use_connection
- def set_url_h1_by_id(id, value):
+ def set_url_title_by_id(id, value):
  value = str(value).strip()

  c = conn.cursor()
- c.execute("UPDATE urls SET h1 = ? WHERE id = ?", (value, id))
+ c.execute("UPDATE urls SET title = ? WHERE id = ?", (value, id))
  conn.commit()


@@ -327,7 +350,9 @@ def set_url_error(url, value):
  @use_connection
  def set_url_type_by_id(url_id, url_type):
  c = conn.cursor()
- c.execute(f"UPDATE urls SET url_type = '{url_type}' WHERE id = {url_id}")
+ c.execute(
+ f"UPDATE urls SET url_type = '{url_type}', last_touch = NULL WHERE id = {url_id}"
+ )
  conn.commit()


@@ -379,8 +404,10 @@ def touch_url(url):
  @use_connection
  def untouch_url(url):
  url = clean_url(url)
+ url = str(url.strip())
+
  c = conn.cursor()
- c.execute("UPDATE urls SET last_touch = NULL WHERE url = ?", (url))
+ c.execute(f"UPDATE urls SET last_touch = NULL, url_type = NULL WHERE url = '{url}'")
  conn.commit()


@@ -426,16 +453,16 @@ def merge_dbs() -> None:


  @use_connection
- def merge_url(url, h1, last_touch, created_at, description, json):
+ def merge_url(url, title, last_touch, created_at, description, json):
  url = clean_url(url)
  c = conn.cursor()

- if h1 is not None:
- h1 = h1.strip()
+ if title is not None:
+ title = title.strip()

  if len(get_url_by_url(url)) == 0:
  c.execute(
- "INSERT INTO urls (url, h1, last_touch , created_at, history, ai_processed, description_links, description, json) VALUES (?, ?, ?, ?, 1, 0, 0, ? , ?)",
- (url, h1, last_touch, created_at, description, json),
+ "INSERT INTO urls (url, title, last_touch , created_at, history, ai_processed, description_links, description, json) VALUES (?, ?, ?, ?, 1, 0, 0, ? , ?)",
+ (url, title, last_touch, created_at, description, json),
  )
  conn.commit()

{ohmyscrapper-0.4.0 → ohmyscrapper-0.7.4}/src/ohmyscrapper/modules/classify_urls.py

@@ -1,11 +1,15 @@
  import ohmyscrapper.models.urls_manager as urls_manager
+ from ohmyscrapper.modules import seed
  import pandas as pd
  import time


  def classify_urls(recursive=False):
- urls_manager.seeds()
  df = urls_manager.get_urls_valid_prefix()
+ if len(df) == 0:
+ seed.seed()
+ classify_urls(recursive=recursive)
+ return

  keep_alive = True
  while keep_alive: