ohmyscrapper 0.7.0__py3-none-any.whl → 0.7.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
ohmyscrapper/__init__.py CHANGED
@@ -20,7 +20,7 @@ from ohmyscrapper.core.config import update
 
 def main():
     parser = argparse.ArgumentParser(prog="ohmyscrapper")
-    parser.add_argument("--version", action="version", version="%(prog)s v0.7.0")
+    parser.add_argument("--version", action="version", version="%(prog)s v0.7.4")
 
     update()
     subparsers = parser.add_subparsers(dest="command", help="Available commands")
@@ -28,6 +28,9 @@ def main():
         "start",
         help="Make the entire process of 📦 loading, 🐶 scraping and 📜🖋️ exporting with the default configuration.",
     )
+    start_parser.add_argument(
+        "-input", default=None, help="File/Folder path or url for pre-loading."
+    )
 
     start_parser.add_argument(
         "--ai",
@@ -50,6 +53,14 @@ def main():
         help="Add all `url_types` from the bank to the `/ohmyscrapper/url_types.yaml` file.",
         action="store_true",
     )
+
+    seed_parser.add_argument(
+        "--reset",
+        default=False,
+        help="Reset all `url_types`.",
+        action="store_true",
+    )
+
     untouch_parser = subparsers.add_parser(
         "untouch-all", help="Untouch all urls. That resets classification"
     )
@@ -85,6 +96,9 @@ def main():
     scrap_urls_parser.add_argument(
         "--verbose", default=False, help="Run in verbose mode", action="store_true"
     )
+    scrap_urls_parser.add_argument(
+        "-input", default=None, help="File/Folder path or url for pre-loading."
+    )
 
     sniff_url_parser = subparsers.add_parser("sniff-url", help="🐕 Sniff/Check url")
     sniff_url_parser.add_argument(
@@ -131,7 +145,7 @@ def main():
         if args.export:
             export_url_types_to_file()
         else:
-            seed()
+            seed(args.reset)
         return
 
     if args.command == "untouch-all":
@@ -143,6 +157,9 @@ def main():
         return
 
     if args.command == "scrap-urls":
+        if args.input != None:
+            load_txt(file_name=args.input, verbose=args.verbose)
+
         scrap_urls(
             recursive=args.recursive,
             ignore_valid_prefix=args.ignore_type,
@@ -182,7 +199,12 @@ def main():
         return
 
     if args.command == "start":
-        load_txt()
+        seed()
+        if args.input != None:
+            load_txt(file_name=args.input)
+        else:
+            load_txt()
+
         scrap_urls(
             recursive=True,
             ignore_valid_prefix=True,
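
Note on the new `-input` option (an illustrative sketch, not part of the diff itself): both `start` and `scrap-urls` now accept a single-dash `-input` value that is handed to `load_txt()` before scraping, so a run can be pre-loaded from a file, folder path, or URL instead of going through the interactive chooser. A minimal programmatic equivalent, assuming the package is installed and that the `ohmyscrapper` console script maps to `main()`; `links.txt` is a hypothetical input file:

import sys

from ohmyscrapper import main

# Equivalent to running `ohmyscrapper start -input links.txt` from a shell:
# seeds url_types, pre-loads the URLs found in links.txt, then scrapes and exports.
sys.argv = ["ohmyscrapper", "start", "-input", "links.txt"]
main()
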

ohmyscrapper/core/config.py CHANGED
@@ -39,6 +39,12 @@ def get_ai(param):
     )
 
 
+def get_sniffing(param):
+    return config_files.get_param(
+        parent_param="sniffing", param=param, default_app_dir=default_app_dir
+    )
+
+
 def load_config(force_default=False):
     config_file_name = "config.yaml"
     config_params = config_files.create_and_read_config_file(

ohmyscrapper/core/config_files.py CHANGED
@@ -4,14 +4,29 @@ import yaml
 
 def create_and_read_config_file(file_name, default_app_dir, force_default=False):
     config_file = config_file_path(file_name, default_app_dir)
+    default_config_params = _get_default_file(default_file=file_name)
     if force_default or not os.path.exists(config_file):
-        config_params = _get_default_file(default_file=file_name)
         overwrite_config_file(
-            data=config_params, file_name=file_name, default_app_dir=default_app_dir
+            data=default_config_params,
+            file_name=file_name,
+            default_app_dir=default_app_dir,
         )
+        config_params = default_config_params
     else:
         with open(config_file, "r") as f:
             config_params = yaml.safe_load(f.read())
+        if complete_config_file(
+            config_params=config_params,
+            default_config_params=default_config_params,
+            file_name=file_name,
+            default_app_dir=default_app_dir,
+        ):
+            config_params = create_and_read_config_file(
+                file_name=file_name,
+                default_app_dir=default_app_dir,
+                force_default=force_default,
+            )
+
     if config_params is None:
         config_params = create_and_read_config_file(
             file_name=file_name, default_app_dir=default_app_dir, force_default=True
@@ -19,6 +34,18 @@ def create_and_read_config_file(file_name, default_app_dir, force_default=False)
     return config_params
 
 
+def complete_config_file(
+    config_params, default_config_params, file_name, default_app_dir
+):
+    has_updated = False
+    for key, values in default_config_params.items():
+        if key not in config_params.keys():
+            has_updated = True
+            data = {key: values}
+            append_config_file(data, file_name, default_app_dir)
+    return has_updated
+
+
 def overwrite_config_file(data, file_name, default_app_dir):
     config_file = config_file_path(file_name, default_app_dir)
     with open(config_file, "+w") as f:
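
Note (an illustrative sketch, not from the diff): `complete_config_file()` is what lets existing installs pick up newly added defaults. When a config file is read, any top-level key present in the bundled default file but missing from the user's copy is appended via `append_config_file()` and the file is re-read, so a pre-0.7.4 `config.yaml` gains the new `sniffing` section on the next run. Roughly, with a hypothetical app directory:

from ohmyscrapper.core import config_files

# Reads (and, if needed, completes) the user's config.yaml. If the file predates
# 0.7.4 and lacks a `sniffing` key, the default `sniffing: {timeout: 10}` block
# is appended and the completed file is read again.
params = config_files.create_and_read_config_file(
    file_name="config.yaml", default_app_dir="/path/to/app_dir"  # hypothetical directory
)
print(params.get("sniffing"))  # expected: {'timeout': 10} once the file is completed
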

ohmyscrapper/core/default_files/config.yaml CHANGED
@@ -14,3 +14,6 @@ default_files:
 
 ai:
   default_prompt_file: prompt.md
+
+sniffing:
+  timeout: 10

ohmyscrapper/core/default_files/url_sniffing.yaml CHANGED
@@ -23,3 +23,7 @@ linkedin_redirect:
     og:url: url_destiny
   atags:
     first-tag-as-url_destiny: 5
+
+read_all_a_tags:
+  atags:
+    load_atags: True

ohmyscrapper/models/urls_manager.py CHANGED
@@ -69,6 +69,14 @@ def seeds(seeds={}):
     return True
 
 
+@use_connection
+def reset_seeds():
+    sql = "DELETE FROM urls_valid_prefix"
+    c = conn.cursor()
+    c.execute(sql)
+    conn.commit()
+
+
 @use_connection
 def add_urls_valid_prefix(url_prefix, url_type):
 
@@ -198,6 +206,8 @@ def get_url_like_unclassified(like_condition):
 
 @use_connection
 def add_url(url, title=None, parent_url=None):
+    if url[:1] == "/":
+        return
     url = clean_url(url)
     c = conn.cursor()
 
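
Note (a small sketch using the function shown above; the example URLs are hypothetical): `add_url()` now silently drops relative paths, so only absolute URLs, such as those harvested from `<a>` tags, end up in the urls table.

import ohmyscrapper.models.urls_manager as urls_manager

urls_manager.add_url(url="/jobs/view/123")                     # relative href: skipped
urls_manager.add_url(url="https://example.com/jobs/view/123")  # absolute URL: stored as before
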
@@ -340,7 +350,9 @@ def set_url_error(url, value):
 @use_connection
 def set_url_type_by_id(url_id, url_type):
     c = conn.cursor()
-    c.execute(f"UPDATE urls SET url_type = '{url_type}' WHERE id = {url_id}")
+    c.execute(
+        f"UPDATE urls SET url_type = '{url_type}', last_touch = NULL WHERE id = {url_id}"
+    )
     conn.commit()
 
 
@@ -392,8 +404,10 @@ def touch_url(url):
 @use_connection
 def untouch_url(url):
     url = clean_url(url)
+    url = str(url.strip())
+
     c = conn.cursor()
-    c.execute("UPDATE urls SET last_touch = NULL WHERE url = ?", (url))
+    c.execute(f"UPDATE urls SET last_touch = NULL, url_type = NULL WHERE url = '{url}'")
     conn.commit()
 
 

ohmyscrapper/modules/load_txt.py CHANGED
@@ -19,14 +19,16 @@ def load_txt(file_name="input", verbose=False):
 
     text_file_content = ""
     if file_name is not None and not os.path.isdir(file_name):
-        print(f"📖 reading file `{file_name}`... ")
         if not os.path.exists(file_name):
             if file_name.startswith("https://") or file_name.startswith("http://"):
+                print(f"📖 reading url `{file_name}`... ")
                 text_file_content = " " + file_name + " "
+                urls_manager.untouch_url(url=file_name)
             else:
                 print(f"\n file `{file_name}` not found.")
                 return
         else:
+            print(f"📖 reading file `{file_name}`... ")
             text_file_content = _increment_file_name(
                 text_file_content=text_file_content, file_name=file_name
             )
@@ -51,13 +53,15 @@ def load_txt(file_name="input", verbose=False):
                 file_name=os.path.join(dir_files, text_files[0]),
             )
         else:
-            print("\nChoose a text file. Use `*` for process all and `q` to quit:")
+            print("\nFiles list:")
             for index, file in enumerate(text_files):
                 print(f"[{index}]:", os.path.join(dir_files, file))
 
             text_file_option = -1
             while text_file_option < 0 or text_file_option >= len(text_files):
-                text_file_option = input("Enter the file number: ")
+                text_file_option = input(
+                    "Choose a text file. Use `*` for process all and `q` to quit. Enter the file number: "
+                )
                 if text_file_option == "*":
                     for file in text_files:
                         text_file_content = _increment_file_name(

ohmyscrapper/modules/scrap_urls.py CHANGED
@@ -21,12 +21,12 @@ def scrap_url(url, verbose=False):
 
     if url_type not in sniffing_config:
         default_type_sniffing = {
-            "bodytags": [{"h1": "title"}],
-            "metatags": [
-                {"og:title": "title"},
-                {"og:description": "description"},
-                {"description": "description"},
-            ],
+            "bodytags": {"h1": "title"},
+            "metatags": {
+                "og:title": "title",
+                "og:description": "description",
+                "description": "description",
+            },
         }
         config.append_url_sniffing({url_type: default_type_sniffing})
         sniffing_config = config.get_url_sniffing()
@@ -104,6 +104,12 @@ def process_sniffed_url(url_report, url, sniffing_config, verbose=False):
     ):
         if "first-a-link" in url_report.keys():
            db_fields["url_destiny"] = url_report["first-a-link"]
+    if (
+        "atags" in sniffing_config.keys()
+        and "load_links" in sniffing_config["atags"].keys()
+    ):
+        for a_link in url_report["a_links"]:
+            urls_manager.add_url(url=a_link["href"], parent_url=url["url"])
 
     if db_fields["title"] is not None:
         urls_manager.set_url_title(url=url["url"], value=db_fields["title"])
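
Note (a hedged sketch, not part of the diff): when a url_type's sniffing config has an `atags` section containing a `load_links` key, every `<a>` href collected in the report is written back to the urls table with the scraped page as `parent_url`, which is how link harvesting can cascade through listing pages. Enabling it for a hypothetical url_type might look like this:

from ohmyscrapper.core import config

# `my_listing_page` is a hypothetical url_type; process_sniffed_url() only checks
# that an `atags` section with a `load_links` key exists for the scraped type.
config.append_url_sniffing({"my_listing_page": {"atags": {"load_links": True}}})
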

ohmyscrapper/modules/seed.py CHANGED
@@ -2,7 +2,10 @@ import ohmyscrapper.models.urls_manager as urls_manager
 from ohmyscrapper.core import config
 
 
-def seed():
+def seed(reset=False):
+    if reset:
+        urls_manager.reset_seeds()
+
     if not config.url_types_file_exists():
         db_url_types = urls_manager.get_urls_valid_prefix()
         if len(db_url_types) > 0:
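
Note (illustrative, assuming the module path listed in RECORD): `ohmyscrapper seed --reset` reaches this function as `seed(args.reset)`, so the existing contents of `urls_valid_prefix` are wiped via `urls_manager.reset_seeds()` before the url_types are seeded again. The same thing can be done directly:

from ohmyscrapper.modules.seed import seed

seed(reset=True)  # drops the stored url_type prefixes, then re-seeds them
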

ohmyscrapper/modules/sniff_url.py CHANGED
@@ -1,6 +1,7 @@
 import requests
 from bs4 import BeautifulSoup
 import json
+from ohmyscrapper.core import config
 
 
 def sniff_url(
@@ -8,6 +9,8 @@ def sniff_url(
     silent=False,
     sniffing_config={},
 ):
+    final_report = {}
+    final_report["error"] = None
     if "metatags" in sniffing_config:
         metatags_to_search = sniffing_config["metatags"]
     else:
@@ -41,10 +44,17 @@ def sniff_url(
     if not silent:
         print("checking url:", url)
 
-    r = requests.get(url=url)
-    soup = BeautifulSoup(r.text, "html.parser")
+    try:
+        r = requests.get(url=url, timeout=config.get_sniffing("timeout"))
+        soup = BeautifulSoup(r.text, "html.parser")
+    except requests.exceptions.ReadTimeout:
+        url_domain = url.split("/")[2]
+        final_report["error"] = (
+            f"!!! timeout (10 seconds) while checking the url with domain: `{url_domain}` !!!"
+        )
+        print(f"\n\n{final_report['error']}\n\n")
+        soup = BeautifulSoup("", "html.parser")
 
-    final_report = {}
     final_report["scrapped-url"] = url
     if len(metatags_to_search) > 0:
         final_report.update(
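
Note (a sketch under assumptions: that `url` is the only required argument of `sniff_url()`, and the example URL is hypothetical): the request timeout now comes from the new `sniffing` section of `config.yaml` (10 seconds in the bundled defaults), and a `ReadTimeout` no longer aborts the run; the report simply carries an `error` entry and an empty document is parsed instead.

from ohmyscrapper.modules.sniff_url import sniff_url

report = sniff_url(url="https://example.com/very-slow-page", silent=True)
if report["error"]:
    # e.g. "!!! timeout (10 seconds) while checking the url with domain: `example.com` !!!"
    print(report["error"])
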
@@ -119,13 +129,13 @@ def _extract_text_tags(soup, silent, body_tags_to_search):
         for obj_tag in soup.find_all(text_tag):
             valid_text_tags[text_tag].append(obj_tag.text.strip())
         valid_text_tags[text_tag] = separator.join(valid_text_tags[text_tag])
-    i = i + 1
-    if not silent:
-        print("-- text tag", i, "--")
-        print("name:", text_tag)
-        print("separator:", separator)
-        print("texts:", valid_text_tags[text_tag])
-        print("---------------- \n")
+        i = i + 1
+        if not silent:
+            print("-- text tag", i, "--")
+            print("name:", text_tag)
+            print("separator:", separator)
+            print("texts:", valid_text_tags[text_tag])
+            print("---------------- \n")
     return valid_text_tags
 
 

ohmyscrapper-0.7.0.dist-info/METADATA → ohmyscrapper-0.7.4.dist-info/METADATA RENAMED
@@ -1,9 +1,10 @@
-Metadata-Version: 2.3
+Metadata-Version: 2.4
 Name: ohmyscrapper
-Version: 0.7.0
+Version: 0.7.4
 Summary: OhMyScrapper scrapes texts and urls looking for links and jobs-data to create a final report with general information about job positions.
 Author: Cesar Cardoso
 Author-email: Cesar Cardoso <hello@cesarcardoso.cc>
+License-Expression: MIT
 Requires-Dist: beautifulsoup4>=4.14.3
 Requires-Dist: google-genai>=1.55.0
 Requires-Dist: markdown>=3.10
@@ -14,9 +15,11 @@ Requires-Dist: requests>=2.32.5
 Requires-Dist: rich>=14.2.0
 Requires-Dist: urlextract>=1.9.0
 Requires-Python: >=3.11
+Project-URL: Changelog, https://github.com/bouli/ohmyscrapper/releases/latest
+Project-URL: Repository, https://github.com/bouli/ohmyscrapper
 Description-Content-Type: text/markdown
 
-# 🐶 OhMyScrapper - v0.7.0
+# 🐶 OhMyScrapper - v0.7.4
 
 OhMyScrapper scrapes texts and urls looking for links and jobs-data to create a
 final report with general information about job positions.

ohmyscrapper-0.7.4.dist-info/RECORD ADDED
@@ -0,0 +1,21 @@
+ohmyscrapper/__init__.py,sha256=x3wLMhIU744W9DRtXoTrPpWghb7UdC3UJSYZh_gpzlw,7095
+ohmyscrapper/__main__.py,sha256=5BjNuyet8AY-POwoF5rGt722rHQ7tJ0Vf0UFUfzzi-I,58
+ohmyscrapper/core/config.py,sha256=aaSLxk6Fuzp88EMax6MAOX3WszH4OfYLz_dJoXlu0ME,3142
+ohmyscrapper/core/config_files.py,sha256=C79-Vgz1E5_jUWtob-yrCyBxsqWEXxqPI_r6TL7D1_Q,3314
+ohmyscrapper/core/default_files/config.yaml,sha256=gi8tqhSumQYJIl8QDisJ6eaib2tdcBNT-GFU-e6Dtns,273
+ohmyscrapper/core/default_files/url_sniffing.yaml,sha256=RU5GYWmC1PdBl4nn7HUfRBwuXz8Rlap75d4W3zWDzPM,465
+ohmyscrapper/core/default_files/url_types.yaml,sha256=20kvv8_iWRT-pLa014RXYpAmPSonn6tDnG302rx7l-o,228
+ohmyscrapper/models/urls_manager.py,sha256=k0N1If4YoRUWHX80OyBNEeJNIzDROc2ur6j8q2OBlqo,12103
+ohmyscrapper/modules/classify_urls.py,sha256=GhiosAQUITy1DQe_PksYV9QRKVTgpkSE28dkutzbWVA,1038
+ohmyscrapper/modules/load_txt.py,sha256=pkWBIdh6vORPfENDZ6wGM89vswnOnc1flqKfkLs9RD8,4138
+ohmyscrapper/modules/merge_dbs.py,sha256=0pK3PPUGSbnaDkdpQUGCHemOVaKO37bfHwnsy_EVpWQ,115
+ohmyscrapper/modules/process_with_ai.py,sha256=kl39Jzl-PUwh6AfmTZ9SLFUYs9Sk4biqgt8rNz3X1FA,7255
+ohmyscrapper/modules/scrap_urls.py,sha256=uN5j0dychVMGu7n1rcpYdba4sqc47ssyCn0tVaiz-Ic,6264
+ohmyscrapper/modules/seed.py,sha256=hHEGSoPXsmclTaRPeIcK2oC1Xpg3_JqBv_YFMD0m5Jw,1044
+ohmyscrapper/modules/show.py,sha256=jsAs4g8ouA9wymkBfkDCbpVWKD-m_20uKG-m1cZAUGA,3877
+ohmyscrapper/modules/sniff_url.py,sha256=1QnxEdCWLjLh0uM72dlPzst64qglqg2MHA_xYlNcLSA,5435
+ohmyscrapper/modules/untouch_all.py,sha256=DAwWYfqMFifHPtFCxSamu0AxHCgk6aJbTnBy6wLucXM,167
+ohmyscrapper-0.7.4.dist-info/WHEEL,sha256=xDCZ-UyfvkGuEHPeI7BcJzYKIZzdqN8A8o1M5Om8IyA,79
+ohmyscrapper-0.7.4.dist-info/entry_points.txt,sha256=BZud6D16XkfjelDa4Z33mji-KJbbZXgq2FoLrzjru5I,52
+ohmyscrapper-0.7.4.dist-info/METADATA,sha256=CVE8WUcraUtONy9UVIU0y8Y7wjsk4zEmMVfpA_al1CU,4261
+ohmyscrapper-0.7.4.dist-info/RECORD,,

ohmyscrapper-0.7.0.dist-info/RECORD REMOVED
@@ -1,21 +0,0 @@
-ohmyscrapper/__init__.py,sha256=w5Ty9eszf8tEv72IQrFov0YbZWMqsraq448xhX3YGQs,6493
-ohmyscrapper/__main__.py,sha256=5BjNuyet8AY-POwoF5rGt722rHQ7tJ0Vf0UFUfzzi-I,58
-ohmyscrapper/core/config.py,sha256=i_RA-zReNQIWWmsFar85qzRUqdqvTFMPeCP7Hya7ltU,2996
-ohmyscrapper/core/config_files.py,sha256=KC3yChTnlclclU9EKTqFBoAu9p6XdOKuegub5NPYDDY,2434
-ohmyscrapper/core/default_files/config.yaml,sha256=bgPBVlze2tOCbyrA47h_5BJ35UsXnqsjQszzy0vn-Pw,248
-ohmyscrapper/core/default_files/url_sniffing.yaml,sha256=MKdVR5HQ1i2yTRw2ijzxPSmIyhUno_R4L2k17r3EBBc,417
-ohmyscrapper/core/default_files/url_types.yaml,sha256=20kvv8_iWRT-pLa014RXYpAmPSonn6tDnG302rx7l-o,228
-ohmyscrapper/models/urls_manager.py,sha256=FC1j72M1gzNwC_PzPqnew986b-BI6s7zUv8Z7HiM1M0,11849
-ohmyscrapper/modules/classify_urls.py,sha256=GhiosAQUITy1DQe_PksYV9QRKVTgpkSE28dkutzbWVA,1038
-ohmyscrapper/modules/load_txt.py,sha256=dNkUZ2ehBiPx-q4fPczRiHFvnpzCrjeycFtexhWGmEE,3967
-ohmyscrapper/modules/merge_dbs.py,sha256=0pK3PPUGSbnaDkdpQUGCHemOVaKO37bfHwnsy_EVpWQ,115
-ohmyscrapper/modules/process_with_ai.py,sha256=kl39Jzl-PUwh6AfmTZ9SLFUYs9Sk4biqgt8rNz3X1FA,7255
-ohmyscrapper/modules/scrap_urls.py,sha256=CNoEC-d1r-u4qxnEVimm4ctP6MJGdU8y8VI2Nx0bBdM,6033
-ohmyscrapper/modules/seed.py,sha256=qDUE7TWx9iNQEzqThK4p7g8pTZjdpkmoqI8kOo_zdtk,983
-ohmyscrapper/modules/show.py,sha256=jsAs4g8ouA9wymkBfkDCbpVWKD-m_20uKG-m1cZAUGA,3877
-ohmyscrapper/modules/sniff_url.py,sha256=zJ2Uox2aUdQibL4UFLxg3t7GqJ7WwWEl0q3QSUbMEbc,4960
-ohmyscrapper/modules/untouch_all.py,sha256=DAwWYfqMFifHPtFCxSamu0AxHCgk6aJbTnBy6wLucXM,167
-ohmyscrapper-0.7.0.dist-info/WHEEL,sha256=xDCZ-UyfvkGuEHPeI7BcJzYKIZzdqN8A8o1M5Om8IyA,79
-ohmyscrapper-0.7.0.dist-info/entry_points.txt,sha256=BZud6D16XkfjelDa4Z33mji-KJbbZXgq2FoLrzjru5I,52
-ohmyscrapper-0.7.0.dist-info/METADATA,sha256=Doakf4oDT6oskPGdSlEoRJHBxUmm9FhWaHfDlNIfNuM,4096
-ohmyscrapper-0.7.0.dist-info/RECORD,,
- ohmyscrapper-0.7.0.dist-info/RECORD,,