ohmyscrapper 0.2.3-py3-none-any.whl → 0.7.0-py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
ohmyscrapper/modules/sniff_url.py

@@ -3,31 +3,101 @@ from bs4 import BeautifulSoup
 import json
 
 
-def sniff_url(url="https://www.linkedin.com/in/cesardesouzacardoso/", silent=False):
+def sniff_url(
+    url="https://www.linkedin.com/in/cesardesouzacardoso/",
+    silent=False,
+    sniffing_config={},
+):
+    if "metatags" in sniffing_config:
+        metatags_to_search = sniffing_config["metatags"]
+    else:
+        metatags_to_search = [
+            "description",
+            "og:url",
+            "og:title",
+            "og:description",
+            "og:type",
+            "lnkd:url",
+        ]
+
+    if "bodytags" in sniffing_config:
+        body_tags_to_search = sniffing_config["bodytags"]
+    else:
+        body_tags_to_search = {
+            "h1": "",
+            "h2": "",
+        }
+
+    if type(metatags_to_search) is dict:
+        metatags_to_search = list(metatags_to_search.keys())
+
+    # force clean concatenate without any separator
+    if type(body_tags_to_search) is dict:
+        body_tags_to_search = list(body_tags_to_search.keys())
+
+    if type(body_tags_to_search) is list:
+        body_tags_to_search = dict.fromkeys(body_tags_to_search, " ")
+
     if not silent:
         print("checking url:", url)
-    report_meta_tags = []
-    tags_to_search = [
-        "description",
-        "og:url",
-        "og:title",
-        "og:description",
-        "og:type",
-        "lnkd:url",
-    ]
 
     r = requests.get(url=url)
     soup = BeautifulSoup(r.text, "html.parser")
 
+    final_report = {}
+    final_report["scrapped-url"] = url
+    if len(metatags_to_search) > 0:
+        final_report.update(
+            _extract_meta_tags(
+                soup=soup, silent=silent, metatags_to_search=metatags_to_search
+            )
+        )
+
+    if len(body_tags_to_search) > 0:
+        final_report.update(
+            _extract_text_tags(
+                soup=soup, silent=silent, body_tags_to_search=body_tags_to_search
+            )
+        )
+    final_report["a_links"] = _extract_a_tags(soup=soup, silent=silent)
+    final_report = _complementary_report(final_report, soup, silent).copy()
+    final_report["json"] = json.dumps(final_report)
+
+    return final_report
+
+
+def _extract_a_tags(soup, silent):
+    a_links = []
+    if not silent:
+        print("\n\n\n\n---- all <a> links ---")
+
+    i = 0
+    for a_tag in soup.find_all("a"):
+        i = i + 1
+        a_links.append({"text": a_tag.text, "href": a_tag.get("href")})
+        if not silent:
+            print("\n-- <a> link", i, "-- ")
+            print("target:", a_tag.get("target"))
+            print("text:", str(a_tag.text).strip())
+            print("href:", a_tag.get("href"))
+            print("-------------- ")
+    return a_links
+
+
+def _extract_meta_tags(soup, silent, metatags_to_search):
+    valid_meta_tags = {}
     if not silent:
         print("\n\n\n\n---- all <meta> tags ---\n")
     i = 0
     for meta_tag in soup.find_all("meta"):
         if (
-            meta_tag.get("name") in tags_to_search
-            or meta_tag.get("property") in tags_to_search
+            meta_tag.get("name") in metatags_to_search
+            or meta_tag.get("property") in metatags_to_search
         ):
-            report_meta_tags.append(meta_tag)
+            if meta_tag.get("name") is not None:
+                valid_meta_tags[meta_tag.get("name")] = meta_tag.get("content")
+            elif meta_tag.get("property") is not None:
+                valid_meta_tags[meta_tag.get("property")] = meta_tag.get("content")
             i = i + 1
             if not silent:
                 print("-- meta tag", i, "--")
@@ -35,54 +105,51 @@ def sniff_url(url="https://www.linkedin.com/in/cesardesouzacardoso/", silent=False):
                 print("property:", meta_tag.get("property"))
                 print("content:", meta_tag.get("content"))
                 print("---------------- \n")
+    return valid_meta_tags
 
+
+def _extract_text_tags(soup, silent, body_tags_to_search):
+    valid_text_tags = {}
     if not silent:
-        print("\n\n\n\n---- all <a> links ---")
-    i = 0
-    for a_tag in soup.find_all("a"):
-        i = i + 1
-        print("\n-- a link", i, "-- ")
-        print("target:", a_tag.get("target"))
-        print("text:", a_tag.text)
-        print("href:", a_tag.get("href"))
-        print("-------------- ")
+        print("\n\n\n\n---- all <text> tags ---\n")
+    i = 0
+    for text_tag, separator in body_tags_to_search.items():
+        if len(soup.find_all(text_tag)) > 0:
+            valid_text_tags[text_tag] = []
+            for obj_tag in soup.find_all(text_tag):
+                valid_text_tags[text_tag].append(obj_tag.text.strip())
+            valid_text_tags[text_tag] = separator.join(valid_text_tags[text_tag])
+            i = i + 1
+            if not silent:
+                print("-- text tag", i, "--")
+                print("name:", text_tag)
+                print("separator:", separator)
+                print("texts:", valid_text_tags[text_tag])
+                print("---------------- \n")
+    return valid_text_tags
 
-    final_report = {}
-    final_report["scrapped-url"] = url
-    if len(soup.find_all("h1")) > 0:
-        final_report["h1"] = soup.find("h1").text
-
-    for report_meta_tag in report_meta_tags:
-        if report_meta_tag.get("name") is not None:
-            final_report[report_meta_tag.get("name")] = report_meta_tag.get("content")
-        elif report_meta_tag.get("property") is not None:
-            final_report[report_meta_tag.get("property")] = report_meta_tag.get(
-                "content"
-            )
 
-    if len(soup.find_all("a")) > 0:
-        final_report["first-a-link"] = soup.find("a").get("href")
-        final_report["total-a-links"] = len(soup.find_all("a"))
+def _complementary_report(final_report, soup, silent):
+
+    if len(final_report["a_links"]) > 0:
+        final_report["first-a-link"] = final_report["a_links"][0]["href"]
+        final_report["total-a-links"] = len(final_report["a_links"])
     else:
         final_report["first-a-link"] = ""
         final_report["total-a-links"] = 0
 
-    if len(soup.find_all("h2")) > 0:
-        final_report["h2"] = soup.find("h2").text
-
     if len(soup.find_all("meta")) > 0:
         final_report["total-meta-tags"] = len(soup.find_all("meta"))
     else:
         final_report["total-meta-tags"] = 0
-
-    final_report["json"] = json.dumps(final_report)
     if not silent:
         print("\n\n\n----report---\n")
         for key in final_report:
-            print("* ", key, ":", final_report[key])
+            if key != "a_links":
+                print("* ", key, ":", final_report[key])
 
     return final_report
 
 
-def get_tags(url):
-    return sniff_url(url=url, silent=True)
+def get_tags(url, sniffing_config={}):
+    return sniff_url(url=url, silent=True, sniffing_config=sniffing_config)
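Taken together, the two hunks refactor `sniff_url` into a dispatcher over three helpers and make the report shape configurable. A minimal usage sketch, assuming the import path from the RECORD below (`ohmyscrapper/modules/sniff_url.py`) and an illustrative target URL:

```python
from ohmyscrapper.modules.sniff_url import get_tags

# metatags: <meta> name/property values to keep; bodytags: tag names
# whose texts are joined (always with " ", per the normalization above).
config = {
    "metatags": ["og:title", "og:description"],
    "bodytags": ["h1", "h2"],
}

report = get_tags("https://cesarcardoso.cc/", sniffing_config=config)
print(report["scrapped-url"], report["total-a-links"])
print(report.get("og:title"))  # present only if the page sets that tag
```

One nit for a future pass: `sniffing_config={}` is a mutable default argument; the functions only read from it here, so it is harmless, but linters will flag it.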
ohmyscrapper/modules/untouch_all.py

@@ -3,5 +3,5 @@ import ohmyscrapper.models.urls_manager as urls_manager
 
 def untouch_all():
     urls_manager.untouch_all_urls()
-    print("urls have been untouched")
+    print("🙌 urls have been untouched")
     return
ohmyscrapper-0.2.3.dist-info/METADATA → ohmyscrapper-0.7.0.dist-info/METADATA

@@ -1,7 +1,7 @@
 Metadata-Version: 2.3
 Name: ohmyscrapper
-Version: 0.2.3
-Summary: This project aims to create a text-based scraper containing links to create a final PDF with general information about job openings.
+Version: 0.7.0
+Summary: OhMyScrapper scrapes texts and urls looking for links and jobs-data to create a final report with general information about job positions.
 Author: Cesar Cardoso
 Author-email: Cesar Cardoso <hello@cesarcardoso.cc>
 Requires-Dist: beautifulsoup4>=4.14.3
@@ -16,16 +16,17 @@ Requires-Dist: urlextract>=1.9.0
 Requires-Python: >=3.11
 Description-Content-Type: text/markdown
 
-# OhMyScrapper - v0.2.3
+# 🐶 OhMyScrapper - v0.7.0
 
-This project aims to create a text-based scraper containing links to create a
-final PDF with general information about job openings.
+OhMyScrapper scrapes texts and urls looking for links and jobs-data to create a
+final report with general information about job positions.
 
 ## Scope
 
 - Read texts;
-- Extract links;
-- Use meta og:tags to extract information;
+- Extract and load urls;
+- Scrapes the urls looking for og:tags and titles;
+- Export a list of links with relevant information;
 
 ## Installation
 
@@ -50,7 +51,7 @@ uvx ohmyscrapper --version
 
 OhMyScrapper works in 3 stages:
 
-1. It collects and loads urls from a text (by default `input/_chat.txt`) in a database;
+1. It collects and loads urls from a text in a database;
 2. It scraps/access the collected urls and read what is relevant. If it finds new urls, they are collected as well;
 3. Export a list of urls in CSV files;
 
@@ -58,7 +59,7 @@ You can do 3 stages with the command:
 ```shell
 ohmyscrapper start
 ```
-> Remember to add your text file in the folder `/input` with the name `_chat.txt`!
+> Remember to add your text file in the folder `/input` with the name that finishes with `.txt`!
 
 You will find the exported files in the folder `/output` like this:
 - `/output/report.csv`
@@ -70,18 +71,23 @@ You will find the exported files in the folder `/output` like this:
 
 ### BUT: if you want to do step by step, here it is:
 
-First we load a text file you would like to look for urls, the idea here is to
-use the whatsapp history, but it works with any txt file.
+First we load a text file you would like to look for urls. It it works with any txt file.
 
-The default file is `input/_chat.txt`. If you have the default file you just use
-the command `load`:
+The default folder is `/input`. Put one or more text (finished with `.txt`) files
+in this folder and use the command `load`:
 ```shell
 ohmyscrapper load
 ```
-or, if you have another file, just use the argument `-file` like this:
+or, if you have another file in a different folder, just use the argument `-input` like this:
 ```shell
-ohmyscrapper load -file=my-text-file.txt
+ohmyscrapper load -input=my-text-file.txt
 ```
+In this case, you can add an url directly to the database, like this:
+```shell
+ohmyscrapper load -input=https://cesarcardoso.cc/
+```
+That will append the last url in the database to be scraped.
+
 That will create a database if it doesn't exist and store every url the oh-my-scrapper
 find. After that, let's scrap the urls with the command `scrap-urls`:
 
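Pieced together, the step-by-step flow this README excerpt documents comes down to two commands (only `load` and `scrap-urls` are named in the hunk; the export stage is described further down, outside this diff):

```shell
# stage 1: collect urls from a txt file (or a single url) into the db
ohmyscrapper load -input=my-text-file.txt

# stage 2: visit each stored url and sniff its tags
ohmyscrapper scrap-urls
```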
ohmyscrapper-0.7.0.dist-info/RECORD (added)

@@ -0,0 +1,21 @@
+ohmyscrapper/__init__.py,sha256=w5Ty9eszf8tEv72IQrFov0YbZWMqsraq448xhX3YGQs,6493
+ohmyscrapper/__main__.py,sha256=5BjNuyet8AY-POwoF5rGt722rHQ7tJ0Vf0UFUfzzi-I,58
+ohmyscrapper/core/config.py,sha256=i_RA-zReNQIWWmsFar85qzRUqdqvTFMPeCP7Hya7ltU,2996
+ohmyscrapper/core/config_files.py,sha256=KC3yChTnlclclU9EKTqFBoAu9p6XdOKuegub5NPYDDY,2434
+ohmyscrapper/core/default_files/config.yaml,sha256=bgPBVlze2tOCbyrA47h_5BJ35UsXnqsjQszzy0vn-Pw,248
+ohmyscrapper/core/default_files/url_sniffing.yaml,sha256=MKdVR5HQ1i2yTRw2ijzxPSmIyhUno_R4L2k17r3EBBc,417
+ohmyscrapper/core/default_files/url_types.yaml,sha256=20kvv8_iWRT-pLa014RXYpAmPSonn6tDnG302rx7l-o,228
+ohmyscrapper/models/urls_manager.py,sha256=FC1j72M1gzNwC_PzPqnew986b-BI6s7zUv8Z7HiM1M0,11849
+ohmyscrapper/modules/classify_urls.py,sha256=GhiosAQUITy1DQe_PksYV9QRKVTgpkSE28dkutzbWVA,1038
+ohmyscrapper/modules/load_txt.py,sha256=dNkUZ2ehBiPx-q4fPczRiHFvnpzCrjeycFtexhWGmEE,3967
+ohmyscrapper/modules/merge_dbs.py,sha256=0pK3PPUGSbnaDkdpQUGCHemOVaKO37bfHwnsy_EVpWQ,115
+ohmyscrapper/modules/process_with_ai.py,sha256=kl39Jzl-PUwh6AfmTZ9SLFUYs9Sk4biqgt8rNz3X1FA,7255
+ohmyscrapper/modules/scrap_urls.py,sha256=CNoEC-d1r-u4qxnEVimm4ctP6MJGdU8y8VI2Nx0bBdM,6033
+ohmyscrapper/modules/seed.py,sha256=qDUE7TWx9iNQEzqThK4p7g8pTZjdpkmoqI8kOo_zdtk,983
+ohmyscrapper/modules/show.py,sha256=jsAs4g8ouA9wymkBfkDCbpVWKD-m_20uKG-m1cZAUGA,3877
+ohmyscrapper/modules/sniff_url.py,sha256=zJ2Uox2aUdQibL4UFLxg3t7GqJ7WwWEl0q3QSUbMEbc,4960
+ohmyscrapper/modules/untouch_all.py,sha256=DAwWYfqMFifHPtFCxSamu0AxHCgk6aJbTnBy6wLucXM,167
+ohmyscrapper-0.7.0.dist-info/WHEEL,sha256=xDCZ-UyfvkGuEHPeI7BcJzYKIZzdqN8A8o1M5Om8IyA,79
+ohmyscrapper-0.7.0.dist-info/entry_points.txt,sha256=BZud6D16XkfjelDa4Z33mji-KJbbZXgq2FoLrzjru5I,52
+ohmyscrapper-0.7.0.dist-info/METADATA,sha256=Doakf4oDT6oskPGdSlEoRJHBxUmm9FhWaHfDlNIfNuM,4096
+ohmyscrapper-0.7.0.dist-info/RECORD,,

ohmyscrapper-0.2.3.dist-info/RECORD (removed)

@@ -1,16 +0,0 @@
-ohmyscrapper/__init__.py,sha256=6k-fyuKkTefy5lJiRJFDP7wfkGhYRC8qgdfYlNB_3sk,5841
-ohmyscrapper/__main__.py,sha256=5BjNuyet8AY-POwoF5rGt722rHQ7tJ0Vf0UFUfzzi-I,58
-ohmyscrapper/models/urls_manager.py,sha256=xKql_xdwfRwgpMyriuIrZ0Srz4gYQGMfWClEWpGRtNE,11183
-ohmyscrapper/modules/classify_urls.py,sha256=eyHtTHDZp2pGmYw_X-7LrbeVOgDPcRQdhu0oEuwQtKA,743
-ohmyscrapper/modules/load_txt.py,sha256=mL60OGsh-R80P88vxyqvfBEFag9yhSFFbg5pwtu1f90,889
-ohmyscrapper/modules/merge_dbs.py,sha256=0pK3PPUGSbnaDkdpQUGCHemOVaKO37bfHwnsy_EVpWQ,115
-ohmyscrapper/modules/process_with_ai.py,sha256=TpumucIVNZulKOw2idy4hD3vG5IhG5pbhyJImYFP8g0,6844
-ohmyscrapper/modules/scrap_urls.py,sha256=KQVs3R03X80hmvvJAU1SqnNhwXEeVV99WlN8TxSKqA8,6097
-ohmyscrapper/modules/seed.py,sha256=KeTSbmTdNTkVCtzk9iQmeuEqB0kG-rTZJb2a1WdROL4,129
-ohmyscrapper/modules/show.py,sha256=u0L9uxgU8Xt_-myA3r7byuOmnX_-2gkpTtXWkXon1ns,3572
-ohmyscrapper/modules/sniff_url.py,sha256=jQDc7aSimuOOedw2fSXZlf6_o0OqQHOr6NsWb4n0XgI,2720
-ohmyscrapper/modules/untouch_all.py,sha256=E1U9e3sOG7suzc8ZTWcYiQQo9mPmLJ0piXdXUjFLEd4,162
-ohmyscrapper-0.2.3.dist-info/WHEEL,sha256=xDCZ-UyfvkGuEHPeI7BcJzYKIZzdqN8A8o1M5Om8IyA,79
-ohmyscrapper-0.2.3.dist-info/entry_points.txt,sha256=BZud6D16XkfjelDa4Z33mji-KJbbZXgq2FoLrzjru5I,52
-ohmyscrapper-0.2.3.dist-info/METADATA,sha256=uwthvf7vwhb6H14KdbPLibzp1c3PaVmcJIYePmV8cRc,3832
-ohmyscrapper-0.2.3.dist-info/RECORD,,