ohmyscrapper 0.3.4__tar.gz → 0.7.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (23)
  1. {ohmyscrapper-0.3.4 → ohmyscrapper-0.7.1}/PKG-INFO +10 -4
  2. {ohmyscrapper-0.3.4 → ohmyscrapper-0.7.1}/README.md +9 -3
  3. {ohmyscrapper-0.3.4 → ohmyscrapper-0.7.1}/pyproject.toml +4 -2
  4. {ohmyscrapper-0.3.4 → ohmyscrapper-0.7.1}/src/ohmyscrapper/__init__.py +33 -7
  5. ohmyscrapper-0.7.1/src/ohmyscrapper/core/config.py +107 -0
  6. ohmyscrapper-0.7.1/src/ohmyscrapper/core/config_files.py +73 -0
  7. ohmyscrapper-0.7.1/src/ohmyscrapper/core/default_files/config.yaml +16 -0
  8. ohmyscrapper-0.7.1/src/ohmyscrapper/core/default_files/url_sniffing.yaml +25 -0
  9. ohmyscrapper-0.7.1/src/ohmyscrapper/core/default_files/url_types.yaml +5 -0
  10. {ohmyscrapper-0.3.4 → ohmyscrapper-0.7.1}/src/ohmyscrapper/models/urls_manager.py +42 -29
  11. {ohmyscrapper-0.3.4 → ohmyscrapper-0.7.1}/src/ohmyscrapper/modules/classify_urls.py +5 -1
  12. {ohmyscrapper-0.3.4 → ohmyscrapper-0.7.1}/src/ohmyscrapper/modules/load_txt.py +28 -22
  13. {ohmyscrapper-0.3.4 → ohmyscrapper-0.7.1}/src/ohmyscrapper/modules/process_with_ai.py +17 -13
  14. ohmyscrapper-0.7.1/src/ohmyscrapper/modules/scrap_urls.py +187 -0
  15. ohmyscrapper-0.7.1/src/ohmyscrapper/modules/seed.py +33 -0
  16. {ohmyscrapper-0.3.4 → ohmyscrapper-0.7.1}/src/ohmyscrapper/modules/show.py +15 -14
  17. ohmyscrapper-0.7.1/src/ohmyscrapper/modules/sniff_url.py +155 -0
  18. ohmyscrapper-0.3.4/src/ohmyscrapper/modules/scrap_urls.py +0 -209
  19. ohmyscrapper-0.3.4/src/ohmyscrapper/modules/seed.py +0 -7
  20. ohmyscrapper-0.3.4/src/ohmyscrapper/modules/sniff_url.py +0 -88
  21. {ohmyscrapper-0.3.4 → ohmyscrapper-0.7.1}/src/ohmyscrapper/__main__.py +0 -0
  22. {ohmyscrapper-0.3.4 → ohmyscrapper-0.7.1}/src/ohmyscrapper/modules/merge_dbs.py +0 -0
  23. {ohmyscrapper-0.3.4 → ohmyscrapper-0.7.1}/src/ohmyscrapper/modules/untouch_all.py +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.3
  Name: ohmyscrapper
- Version: 0.3.4
+ Version: 0.7.1
  Summary: OhMyScrapper scrapes texts and urls looking for links and jobs-data to create a final report with general information about job positions.
  Author: Cesar Cardoso
  Author-email: Cesar Cardoso <hello@cesarcardoso.cc>
@@ -16,7 +16,7 @@ Requires-Dist: urlextract>=1.9.0
  Requires-Python: >=3.11
  Description-Content-Type: text/markdown

- # 🐶 OhMyScrapper - v0.3.4
+ # 🐶 OhMyScrapper - v0.7.1

  OhMyScrapper scrapes texts and urls looking for links and jobs-data to create a
  final report with general information about job positions.
@@ -78,10 +78,16 @@ in this folder and use the command `load`:
  ```shell
  ohmyscrapper load
  ```
- or, if you have another file in a different folder, just use the argument `-file` like this:
+ or, if you have another file in a different folder, just use the argument `-input` like this:
  ```shell
- ohmyscrapper load -file=my-text-file.txt
+ ohmyscrapper load -input=my-text-file.txt
  ```
+ In this case, you can add an url directly to the database, like this:
+ ```shell
+ ohmyscrapper load -input=https://cesarcardoso.cc/
+ ```
+ That will append the last url in the database to be scraped.
+
  That will create a database if it doesn't exist and store every url the oh-my-scrapper
  find. After that, let's scrap the urls with the command `scrap-urls`:

@@ -1,4 +1,4 @@
- # 🐶 OhMyScrapper - v0.3.4
+ # 🐶 OhMyScrapper - v0.7.1

  OhMyScrapper scrapes texts and urls looking for links and jobs-data to create a
  final report with general information about job positions.
@@ -60,10 +60,16 @@ in this folder and use the command `load`:
  ```shell
  ohmyscrapper load
  ```
- or, if you have another file in a different folder, just use the argument `-file` like this:
+ or, if you have another file in a different folder, just use the argument `-input` like this:
  ```shell
- ohmyscrapper load -file=my-text-file.txt
+ ohmyscrapper load -input=my-text-file.txt
  ```
+ In this case, you can add an url directly to the database, like this:
+ ```shell
+ ohmyscrapper load -input=https://cesarcardoso.cc/
+ ```
+ That will append the last url in the database to be scraped.
+
  That will create a database if it doesn't exist and store every url the oh-my-scrapper
  find. After that, let's scrap the urls with the command `scrap-urls`:

@@ -1,6 +1,6 @@
  [project]
  name = "ohmyscrapper"
- version = "0.3.4"
+ version = "0.7.1"
  description = "OhMyScrapper scrapes texts and urls looking for links and jobs-data to create a final report with general information about job positions."
  readme = "README.md"
  authors = [
@@ -29,11 +29,13 @@ build-backend = "uv_build"
  [tool.bumpversion]
  tag = true
  tag_name = "v{new_version}"
- pre_commit_hooks = ["uv sync --upgrade", "git add uv.lock"]
+ pre_commit_hooks = ["uvx black ./src", "git add src", "git commit -m 'chore: Beautify with black'", "uv sync --upgrade", "git add uv.lock"]
  commit = true

  [[tool.bumpversion.files]]
  filename = "pyproject.toml"
+ search = 'version = "{current_version}"'
+ replace = 'version = "{new_version}"'

  [[tool.bumpversion.files]]
  filename = "README.md"
@@ -3,7 +3,7 @@ import argparse
  from ohmyscrapper.modules.classify_urls import classify_urls
  from ohmyscrapper.modules.sniff_url import sniff_url
  from ohmyscrapper.modules.load_txt import load_txt
- from ohmyscrapper.modules.seed import seed
+ from ohmyscrapper.modules.seed import seed, export_url_types_to_file
  from ohmyscrapper.modules.scrap_urls import scrap_urls
  from ohmyscrapper.modules.show import (
  show_url,
@@ -15,17 +15,22 @@ from ohmyscrapper.modules.show import (
  from ohmyscrapper.modules.untouch_all import untouch_all
  from ohmyscrapper.modules.process_with_ai import process_with_ai, reprocess_ai_history
  from ohmyscrapper.modules.merge_dbs import merge_dbs
+ from ohmyscrapper.core.config import update


  def main():
  parser = argparse.ArgumentParser(prog="ohmyscrapper")
- parser.add_argument("--version", action="version", version="%(prog)s v0.3.4")
+ parser.add_argument("--version", action="version", version="%(prog)s v0.7.1")

+ update()
  subparsers = parser.add_subparsers(dest="command", help="Available commands")
  start_parser = subparsers.add_parser(
  "start",
  help="Make the entire process of 📦 loading, 🐶 scraping and 📜🖋️ exporting with the default configuration.",
  )
+ start_parser.add_argument(
+ "-input", default=None, help="File/Folder path or url for pre-loading."
+ )

  start_parser.add_argument(
  "--ai",
@@ -40,7 +45,13 @@ def main():
  )

  seed_parser = subparsers.add_parser(
- "seed", help="Seed database. Necessary to classify urls."
+ "seed", help="Seed database with `url_types` to classify the `urls`."
+ )
+ seed_parser.add_argument(
+ "--export",
+ default=False,
+ help="Add all `url_types` from the bank to the `/ohmyscrapper/url_types.yaml` file.",
+ action="store_true",
  )
  untouch_parser = subparsers.add_parser(
  "untouch-all", help="Untouch all urls. That resets classification"
@@ -54,7 +65,9 @@ def main():
  )

  load_txt_parser = subparsers.add_parser("load", help="📦 Load txt file")
- load_txt_parser.add_argument("-file", default=None, help="File path.")
+ load_txt_parser.add_argument(
+ "-input", default=None, help="File/Folder path or url."
+ )
  load_txt_parser.add_argument(
  "--verbose", default=False, help="Run in verbose mode", action="store_true"
  )
@@ -75,6 +88,9 @@ def main():
  scrap_urls_parser.add_argument(
  "--verbose", default=False, help="Run in verbose mode", action="store_true"
  )
+ scrap_urls_parser.add_argument(
+ "-input", default=None, help="File/Folder path or url for pre-loading."
+ )

  sniff_url_parser = subparsers.add_parser("sniff-url", help="🐕 Sniff/Check url")
  sniff_url_parser.add_argument(
@@ -114,11 +130,14 @@ def main():
  return

  if args.command == "load":
- load_txt(file_name=args.file, verbose=args.verbose)
+ load_txt(file_name=args.input, verbose=args.verbose)
  return

  if args.command == "seed":
- seed()
+ if args.export:
+ export_url_types_to_file()
+ else:
+ seed()
  return

  if args.command == "untouch-all":
@@ -130,6 +149,9 @@ def main():
  return

  if args.command == "scrap-urls":
+ if args.input != None:
+ load_txt(file_name=args.input, verbose=args.verbose)
+
  scrap_urls(
  recursive=args.recursive,
  ignore_valid_prefix=args.ignore_type,
@@ -169,7 +191,11 @@ def main():
  return

  if args.command == "start":
- load_txt()
+ if args.input != None:
+ load_txt(file_name=args.input)
+ else:
+ load_txt()
+
  scrap_urls(
  recursive=True,
  ignore_valid_prefix=True,
@@ -0,0 +1,107 @@
+ import os
+ from ohmyscrapper.core import config_files
+
+ default_app_dir = "ohmyscrapper"
+
+
+ def get_dir(param="ohmyscrapper"):
+ parent_param = "default_dirs"
+
+ if param == default_app_dir:
+ folder = "./" + param
+ else:
+ folder = config_files.get_param(
+ parent_param=parent_param, param=param, default_app_dir=default_app_dir
+ )
+ if not os.path.exists(folder):
+ os.mkdir(folder)
+ return folder
+
+
+ def get_files(param):
+ parent_param = "default_files"
+ return config_files.get_param(
+ parent_param=parent_param, param=param, default_app_dir=default_app_dir
+ )
+
+
+ def get_db(param="db_file"):
+ if param == "folder":
+ return get_dir(param="db")
+ return config_files.get_param(
+ parent_param="db", param=param, default_app_dir=default_app_dir
+ )
+
+
+ def get_ai(param):
+ return config_files.get_param(
+ parent_param="ai", param=param, default_app_dir=default_app_dir
+ )
+
+
+ def load_config(force_default=False):
+ config_file_name = "config.yaml"
+ config_params = config_files.create_and_read_config_file(
+ file_name=config_file_name,
+ default_app_dir=default_app_dir,
+ force_default=force_default,
+ )
+
+ if config_params is None or "default_dirs" not in config_params:
+ config_params = load_config(force_default=True)
+
+ return config_params
+
+
+ def url_types_file_exists():
+ url_types_file = get_files("url_types")
+ return config_files.config_file_exists(
+ url_types_file, default_app_dir=default_app_dir
+ )
+
+
+ def get_url_types():
+ url_types_file = get_files("url_types")
+ return config_files.create_and_read_config_file(
+ url_types_file, default_app_dir=default_app_dir
+ )
+
+
+ def get_url_sniffing():
+ file = get_files("url_sniffing")
+ return config_files.create_and_read_config_file(
+ file, default_app_dir=default_app_dir
+ )
+
+
+ def append_url_sniffing(data):
+ file = get_files("url_sniffing")
+ _append_config_file(data, file)
+
+
+ def append_url_types(url_types):
+ url_types_file = get_files("url_types")
+ _append_config_file(url_types, url_types_file)
+
+
+ def overwrite_config_file(data, file_name):
+ config_files.overwrite_config_file(data, file_name, default_app_dir=default_app_dir)
+
+
+ def _append_config_file(data, file_name):
+ config_files.append_config_file(data, file_name, default_app_dir=default_app_dir)
+
+
+ def update():
+ legacy_folder = "./customize"
+ new_folder = "./ohmyscrapper"
+ if os.path.exists(legacy_folder) and not os.path.exists(new_folder):
+ yes_no = input(
+ "We detected a legacy folder system for your OhMyScrapper, would you like to update? \n"
+ "If you don't update, a new version will be used and your legacy folder will be ignored. \n"
+ "[Y] for yes or any other thing to ignore: "
+ )
+ if yes_no == "Y":
+ os.rename(legacy_folder, new_folder)
+ print(" You are up-to-date! =)")
+ print("")
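The new `core/config.py` wraps `config_files` with package-level defaults. As orientation only (not part of the diff), a minimal sketch of how these helpers resolve values, assuming the default `config.yaml` shown further below:

```python
# Illustrative sketch only; assumes the default ohmyscrapper/config.yaml shipped with 0.7.1.
from ohmyscrapper.core import config

print(config.get_dir("output"))       # "./output" (created on first call if missing)
print(config.get_db())                # "local.db" (the "db_file" key under "db")
print(config.get_db("folder"))        # "./db", resolved via get_dir("db")
print(config.get_files("url_types"))  # "url_types.yaml"
```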
@@ -0,0 +1,73 @@
+ import os
+ import yaml
+
+
+ def create_and_read_config_file(file_name, default_app_dir, force_default=False):
+ config_file = config_file_path(file_name, default_app_dir)
+ if force_default or not os.path.exists(config_file):
+ config_params = _get_default_file(default_file=file_name)
+ overwrite_config_file(
+ data=config_params, file_name=file_name, default_app_dir=default_app_dir
+ )
+ else:
+ with open(config_file, "r") as f:
+ config_params = yaml.safe_load(f.read())
+ if config_params is None:
+ config_params = create_and_read_config_file(
+ file_name=file_name, default_app_dir=default_app_dir, force_default=True
+ )
+ return config_params
+
+
+ def overwrite_config_file(data, file_name, default_app_dir):
+ config_file = config_file_path(file_name, default_app_dir)
+ with open(config_file, "+w") as f:
+ f.write(yaml.safe_dump(data))
+
+
+ def append_config_file(data, file_name, default_app_dir):
+ config_file = config_file_path(file_name, default_app_dir)
+ # append
+ with open(config_file, "+a") as f:
+ yaml.dump(data, f, allow_unicode=True)
+ # read
+ with open(config_file, "r") as f:
+ data = yaml.safe_load(f.read())
+ # overwrite preventing repetition
+ with open(config_file, "w") as f:
+ yaml.dump(data, f, allow_unicode=True)
+
+
+ def get_param(parent_param, param, default_app_dir):
+ default_dirs = create_and_read_config_file(
+ file_name="config.yaml", default_app_dir=default_app_dir
+ )[parent_param]
+
+ if param in default_dirs:
+ return default_dirs[param]
+ else:
+ raise Exception(f"{param} do not exist in your params {parent_param}.")
+
+
+ def config_file_exists(file_name, default_app_dir):
+ return os.path.exists(config_file_path(file_name, default_app_dir))
+
+
+ def config_file_path(file_name, default_app_dir):
+ _ensure_default_app_dir(default_app_dir)
+ config_file = os.path.join(default_app_dir, file_name)
+ return config_file
+
+
+ def _ensure_default_app_dir(default_app_dir):
+ if not os.path.exists(default_app_dir):
+ os.mkdir(default_app_dir)
+
+
+ def _get_default_file(default_file):
+ default_files_dir = os.path.join(
+ os.path.dirname(os.path.realpath(__file__)), "default_files"
+ )
+ default_file = os.path.join(default_files_dir, default_file)
+ with open(default_file, "r") as f:
+ return yaml.safe_load(f.read())
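A hedged sketch of the self-healing read path in `config_files.create_and_read_config_file`: if the user's file is missing or empty, the packaged default from `core/default_files/` is written out and returned.

```python
# Sketch only; file names come straight from the diff above and the default files below.
from ohmyscrapper.core import config_files

url_types = config_files.create_and_read_config_file(
    file_name="url_types.yaml", default_app_dir="ohmyscrapper"
)
# If ohmyscrapper/url_types.yaml was absent, it now exists with the packaged defaults,
# e.g. {"linkedin_job": "https://%.linkedin.com/jobs/view/%", ...}
print(url_types)
```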
@@ -0,0 +1,16 @@
+ db:
+ db_file: local.db
+
+ default_dirs:
+ db: ./db
+ input: ./input
+ output: ./output
+ prompts: ./prompts
+ templates: ./templates
+
+ default_files:
+ url_types: url_types.yaml
+ url_sniffing: url_sniffing.yaml
+
+ ai:
+ default_prompt_file: prompt.md
@@ -0,0 +1,25 @@
+ linkedin_feed:
+ metatags:
+ og:url: url_destiny
+
+ linkedin_job:
+ bodytags:
+ h1: title
+ metatags:
+ og:title: title
+ og:description: description
+ description: description
+
+ linkedin_post:
+ bodytags:
+ h1: title
+ metatags:
+ og:title: title
+ og:description: description
+ description: description
+
+ linkedin_redirect:
+ metatags:
+ og:url: url_destiny
+ atags:
+ first-tag-as-url_destiny: 5
@@ -0,0 +1,5 @@
+ linkedin_company: https://%.linkedin.com/company/%
+ linkedin_feed: https://%.linkedin.com/feed/%
+ linkedin_job: https://%.linkedin.com/jobs/view/%
+ linkedin_post: https://%.linkedin.com/posts/%
+ linkedin_redirect: https://lnkd.in/%
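The `%` wildcards in these patterns read like SQL `LIKE` placeholders (the urls_manager functions in this diff are named `add_urls_valid_prefix` and `get_url_like_unclassified`). Under that assumption, a quick standalone way to check a pattern against a URL:

```python
# Assumption: the url_types patterns are applied as SQLite LIKE filters.
import sqlite3

conn = sqlite3.connect(":memory:")
match = conn.execute(
    "SELECT ? LIKE ?",
    ("https://www.linkedin.com/jobs/view/123456", "https://%.linkedin.com/jobs/view/%"),
).fetchone()[0]
print(match)  # 1 -> the linkedin_job pattern matches this URL
```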
@@ -4,16 +4,19 @@ import time
  import glob
  import pandas as pd
  from urllib.parse import urlparse, urlunparse
+ from ohmyscrapper.core import config


  def get_db_dir():
- if not os.path.exists("db"):
- os.mkdir("db")
- return "db"
+ db_folder = config.get_dir("db")
+ if not os.path.exists(db_folder):
+ os.mkdir(db_folder)
+ return db_folder


  def get_db_path():
- return get_db_dir() + "/local.db"
+ db_file = config.get_db()
+ return os.path.join(get_db_dir(), db_file)


  def get_db_connection():
@@ -26,7 +29,11 @@ def use_connection(func):
  def provide_connection(*args, **kwargs):
  global conn
  with get_db_connection() as conn:
- return func(*args, **kwargs)
+ try:
+ return func(*args, **kwargs)
+ except:
+ update_db()
+ return func(*args, **kwargs)

  return provide_connection

@@ -35,7 +42,7 @@ def create_tables(conn):

  c = conn.cursor()
  c.execute(
- "CREATE TABLE IF NOT EXISTS urls (id INTEGER PRIMARY KEY, url_type STRING, parent_url TEXT, url TEXT UNIQUE, url_destiny TEXT, h1 TEXT, error TEXT, description TEXT, description_links INTEGER DEFAULT 0, json TEXT, json_ai TEXT, ai_processed INTEGER DEFAULT 0, history INTEGER DEFAULT 0, last_touch DATETIME, created_at DATETIME)"
+ "CREATE TABLE IF NOT EXISTS urls (id INTEGER PRIMARY KEY, url_type STRING, parent_url TEXT, url TEXT UNIQUE, url_destiny TEXT, title TEXT, error TEXT, description TEXT, description_links INTEGER DEFAULT 0, json TEXT, json_ai TEXT, ai_processed INTEGER DEFAULT 0, history INTEGER DEFAULT 0, last_touch DATETIME, created_at DATETIME)"
  )
  c.execute(
  "CREATE TABLE IF NOT EXISTS ai_log (id INTEGER PRIMARY KEY, instructions STRING, response STRING, model STRING, prompt_file STRING, prompt_name STRING, created_at DATETIME)"
@@ -46,12 +53,18 @@ def create_tables(conn):
  )


- def seeds():
- add_urls_valid_prefix("https://%.linkedin.com/posts/%", "linkedin_post")
- add_urls_valid_prefix("https://lnkd.in/%", "linkedin_redirect")
- add_urls_valid_prefix("https://%.linkedin.com/jobs/view/%", "linkedin_job")
- add_urls_valid_prefix("https://%.linkedin.com/feed/%", "linkedin_feed")
- add_urls_valid_prefix("https://%.linkedin.com/company/%", "linkedin_company")
+ def update_db():
+ try:
+ c = conn.cursor()
+ c.execute("ALTER TABLE urls RENAME COLUMN h1 TO title")
+ except:
+ pass
+
+
+ def seeds(seeds={}):
+
+ for url_type, url_prefix in seeds.items():
+ add_urls_valid_prefix(url_prefix, url_type)

  return True

@@ -117,7 +130,7 @@ def get_urls_report():
  SELECT
  u.id,
  u.url,
- u.h1
+ u.title
  FROM urls u
  INNER JOIN parent_url p
  ON u.url = p.parent_url
@@ -126,9 +139,9 @@ def get_urls_report():
  u.id,
  u.url_type,
  u.url,
- COALESCE(u.h1, p.h1) as h1,
+ COALESCE(u.title, p.title) as title,
  p.url as parent_url,
- p.h1 as parent_h1
+ p.title as parent_title
  FROM urls u
  LEFT JOIN parents p
  ON u.parent_url = p.url
@@ -184,12 +197,12 @@ def get_url_like_unclassified(like_condition):


  @use_connection
- def add_url(url, h1=None, parent_url=None):
+ def add_url(url, title=None, parent_url=None):
  url = clean_url(url)
  c = conn.cursor()

- if h1 is not None:
- h1 = h1.strip()
+ if title is not None:
+ title = title.strip()

  if parent_url is None:
  parent_url = None
@@ -198,8 +211,8 @@ def add_url(url, h1=None, parent_url=None):

  if len(get_url_by_url(url)) == 0:
  c.execute(
- "INSERT INTO urls (url, h1, parent_url, created_at, ai_processed, description_links, history) VALUES (?, ?, ?, ?, 0, 0, 0)",
- (url, h1, parent_url, int(time.time())),
+ "INSERT INTO urls (url, title, parent_url, created_at, ai_processed, description_links, history) VALUES (?, ?, ?, ?, 0, 0, 0)",
+ (url, title, parent_url, int(time.time())),
  )
  conn.commit()

@@ -238,20 +251,20 @@ def set_url_destiny(url, destiny):


  @use_connection
- def set_url_h1(url, value):
+ def set_url_title(url, value):
  value = str(value).strip()
  url = clean_url(url)
  c = conn.cursor()
- c.execute("UPDATE urls SET h1 = ? WHERE url = ?", (value, url))
+ c.execute("UPDATE urls SET title = ? WHERE url = ?", (value, url))
  conn.commit()


  @use_connection
- def set_url_h1_by_id(id, value):
+ def set_url_title_by_id(id, value):
  value = str(value).strip()

  c = conn.cursor()
- c.execute("UPDATE urls SET h1 = ? WHERE id = ?", (value, id))
+ c.execute("UPDATE urls SET title = ? WHERE id = ?", (value, id))
  conn.commit()


@@ -426,16 +439,16 @@ def merge_dbs() -> None:


  @use_connection
- def merge_url(url, h1, last_touch, created_at, description, json):
+ def merge_url(url, title, last_touch, created_at, description, json):
  url = clean_url(url)
  c = conn.cursor()

- if h1 is not None:
- h1 = h1.strip()
+ if title is not None:
+ title = title.strip()

  if len(get_url_by_url(url)) == 0:
  c.execute(
- "INSERT INTO urls (url, h1, last_touch , created_at, history, ai_processed, description_links, description, json) VALUES (?, ?, ?, ?, 1, 0, 0, ? , ?)",
- (url, h1, last_touch, created_at, description, json),
+ "INSERT INTO urls (url, title, last_touch , created_at, history, ai_processed, description_links, description, json) VALUES (?, ?, ?, ?, 1, 0, 0, ? , ?)",
+ (url, title, last_touch, created_at, description, json),
  )
  conn.commit()
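With `seeds()` now parameterised, the hard-coded LinkedIn prefixes move into `url_types.yaml`. The wiring below is an assumption (the new `modules/seed.py` is not shown in this diff), but it matches the signatures introduced above:

```python
# Assumed wiring, for illustration: feed the url_types mapping from
# ohmyscrapper/url_types.yaml into the new seeds() signature.
import ohmyscrapper.models.urls_manager as urls_manager
from ohmyscrapper.core import config

urls_manager.seeds(seeds=config.get_url_types())
# equivalent to calling add_urls_valid_prefix(prefix, url_type) for each entry
```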
@@ -1,11 +1,15 @@
  import ohmyscrapper.models.urls_manager as urls_manager
+ from ohmyscrapper.modules import seed
  import pandas as pd
  import time


  def classify_urls(recursive=False):
- urls_manager.seeds()
  df = urls_manager.get_urls_valid_prefix()
+ if len(df) == 0:
+ seed.seed()
+ classify_urls(recursive=recursive)
+ return

  keep_alive = True
  while keep_alive: