ohmyscrapper 0.2.1__py3-none-any.whl → 0.6.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
--- a/ohmyscrapper/modules/sniff_url.py
+++ b/ohmyscrapper/modules/sniff_url.py
@@ -6,8 +6,11 @@ import json
 def sniff_url(url="https://www.linkedin.com/in/cesardesouzacardoso/", silent=False):
     if not silent:
         print("checking url:", url)
-    report_meta_tags = []
-    tags_to_search = [
+
+    r = requests.get(url=url)
+    soup = BeautifulSoup(r.text, "html.parser")
+
+    metatags_to_search = [
         "description",
         "og:url",
         "og:title",
@@ -16,18 +19,62 @@ def sniff_url(url="https://www.linkedin.com/in/cesardesouzacardoso/", silent=Fal
         "lnkd:url",
     ]
 
-    r = requests.get(url=url)
-    soup = BeautifulSoup(r.text, "html.parser")
+    text_tags_to_search = {
+        "h1": "",
+        "h2": "|",
+    }
+
+    final_report = {}
+    final_report["scrapped-url"] = url
+    final_report.update(
+        _extract_meta_tags(
+            soup=soup, silent=silent, metatags_to_search=metatags_to_search
+        )
+    )
+    final_report.update(
+        _extract_text_tags(
+            soup=soup, silent=silent, text_tags_to_search=text_tags_to_search
+        )
+    )
+    final_report["a_links"] = _extract_a_tags(soup=soup, silent=silent)
+    final_report = _complementary_report(final_report, soup, silent).copy()
+    final_report["json"] = json.dumps(final_report)
+
+    return final_report
+
+
+def _extract_a_tags(soup, silent):
+    a_links = []
+    if not silent:
+        print("\n\n\n\n---- all <a> links ---")
+
+    i = 0
+    for a_tag in soup.find_all("a"):
+        i = i + 1
+        a_links.append({"text": a_tag.text, "href": a_tag.get("href")})
+        if not silent:
+            print("\n-- <a> link", i, "-- ")
+            print("target:", a_tag.get("target"))
+            print("text:", str(a_tag.text).strip())
+            print("href:", a_tag.get("href"))
+            print("-------------- ")
+    return a_links
 
+
+def _extract_meta_tags(soup, silent, metatags_to_search):
+    valid_meta_tags = {}
     if not silent:
         print("\n\n\n\n---- all <meta> tags ---\n")
     i = 0
     for meta_tag in soup.find_all("meta"):
         if (
-            meta_tag.get("name") in tags_to_search
-            or meta_tag.get("property") in tags_to_search
+            meta_tag.get("name") in metatags_to_search
+            or meta_tag.get("property") in metatags_to_search
         ):
-            report_meta_tags.append(meta_tag)
+            if meta_tag.get("name") is not None:
+                valid_meta_tags[meta_tag.get("name")] = meta_tag.get("content")
+            elif meta_tag.get("property") is not None:
+                valid_meta_tags[meta_tag.get("property")] = meta_tag.get("content")
         i = i + 1
         if not silent:
             print("-- meta tag", i, "--")
@@ -35,51 +82,48 @@ def sniff_url(url="https://www.linkedin.com/in/cesardesouzacardoso/", silent=Fal
             print("property:", meta_tag.get("property"))
             print("content:", meta_tag.get("content"))
             print("---------------- \n")
+    return valid_meta_tags
+
 
+def _extract_text_tags(soup, silent, text_tags_to_search):
+    valid_text_tags = {}
     if not silent:
-        print("\n\n\n\n---- all <a> links ---")
-    i = 0
-    for a_tag in soup.find_all("a"):
-        i = i + 1
-        print("\n-- a link", i, "-- ")
-        print("target:", a_tag.get("target"))
-        print("text:", a_tag.text)
-        print("href:", a_tag.get("href"))
-        print("-------------- ")
+        print("\n\n\n\n---- all <text> tags ---\n")
+    i = 0
+    for text_tag, separator in text_tags_to_search.items():
+        if len(soup.find_all(text_tag)) > 0:
+            valid_text_tags[text_tag] = []
+            for obj_tag in soup.find_all(text_tag):
+                valid_text_tags[text_tag].append(obj_tag.text.strip())
+            valid_text_tags[text_tag] = separator.join(valid_text_tags[text_tag])
+        i = i + 1
+        if not silent:
+            print("-- text tag", i, "--")
+            print("name:", text_tag)
+            print("separator:", separator)
+            print("texts:", valid_text_tags[text_tag])
+            print("---------------- \n")
+    return valid_text_tags
 
-    final_report = {}
-    final_report["scrapped-url"] = url
-    if len(soup.find_all("h1")) > 0:
-        final_report["h1"] = soup.find("h1").text
-
-    for report_meta_tag in report_meta_tags:
-        if report_meta_tag.get("name") is not None:
-            final_report[report_meta_tag.get("name")] = report_meta_tag.get("content")
-        elif report_meta_tag.get("property") is not None:
-            final_report[report_meta_tag.get("property")] = report_meta_tag.get(
-                "content"
-            )
-
-    if len(soup.find_all("a")) > 0:
-        final_report["first-a-link"] = soup.find("a").get("href")
-        final_report["total-a-links"] = len(soup.find_all("a"))
+
+def _complementary_report(final_report, soup, silent):
+
+    if len(final_report["a_links"]) > 0:
+        final_report["first-a-link"] = final_report["a_links"][0]["href"]
+        final_report["total-a-links"] = len(final_report["a_links"])
     else:
         final_report["first-a-link"] = ""
         final_report["total-a-links"] = 0
 
-    if len(soup.find_all("h2")) > 0:
-        final_report["h2"] = soup.find("h2").text
-
     if len(soup.find_all("meta")) > 0:
         final_report["total-meta-tags"] = len(soup.find_all("meta"))
     else:
         final_report["total-meta-tags"] = 0
-
-    final_report["json"] = json.dumps(final_report)
     if not silent:
         print("\n\n\n----report---\n")
         for key in final_report:
-            print("* ", key, ":", final_report[key])
+            if key != "a_links":
+                print("* ", key, ":", final_report[key])
 
     return final_report
 
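For context on the refactor above: `sniff_url()` now delegates to `_extract_meta_tags`, `_extract_text_tags` and `_extract_a_tags`, and returns a flat report dict instead of only printing one. Below is a minimal sketch of consuming that dict, assuming the module path shown in the 0.6.1 RECORD later in this diff is importable and that the fetched page exposes the usual og: tags; this usage is an illustration, not something the package documents.

```python
# Sketch only: assumes ohmyscrapper 0.6.1 is installed and the target page is reachable.
from ohmyscrapper.modules.sniff_url import sniff_url

report = sniff_url(url="https://cesarcardoso.cc/", silent=True)

print(report["scrapped-url"])    # the url that was fetched
print(report.get("og:title"))    # matched meta tags become top-level keys (only if present)
print(report.get("h2"))          # h2 texts joined with "|", h1 texts concatenated with ""
print(report["total-a-links"])   # counters added by _complementary_report()
print(report["first-a-link"])
print(report["json"])            # the whole report serialized via json.dumps
```

Note the separators in `text_tags_to_search`: all `h1` texts are concatenated directly, while `h2` texts are joined with `|`.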
--- a/ohmyscrapper/modules/untouch_all.py
+++ b/ohmyscrapper/modules/untouch_all.py
@@ -3,5 +3,5 @@ import ohmyscrapper.models.urls_manager as urls_manager
 
 def untouch_all():
     urls_manager.untouch_all_urls()
-    print("urls have been untouched")
+    print("🙌 urls have been untouched")
     return
--- a/ohmyscrapper-0.2.1.dist-info/METADATA
+++ b/ohmyscrapper-0.6.1.dist-info/METADATA
@@ -1,9 +1,9 @@
 Metadata-Version: 2.3
 Name: ohmyscrapper
-Version: 0.2.1
-Summary: This project aims to create a text-based scraper containing links to create a final PDF with general information about job openings.
-Author: Cesar Cardoso gh@bouli
-Author-email: Cesar Cardoso gh@bouli <hello@cesarcardoso.cc>
+Version: 0.6.1
+Summary: OhMyScrapper scrapes texts and urls looking for links and jobs-data to create a final report with general information about job positions.
+Author: Cesar Cardoso
+Author-email: Cesar Cardoso <hello@cesarcardoso.cc>
 Requires-Dist: beautifulsoup4>=4.14.3
 Requires-Dist: google-genai>=1.55.0
 Requires-Dist: markdown>=3.10
@@ -16,39 +16,50 @@ Requires-Dist: urlextract>=1.9.0
 Requires-Python: >=3.11
 Description-Content-Type: text/markdown
 
-# OhMyScrapper - v0.2.1
+# 🐶 OhMyScrapper - v0.6.1
 
-This project aims to create a text-based scraper containing links to create a
-final PDF with general information about job openings.
-
-> This project is using [uv](https://docs.astral.sh/uv/getting-started/installation/#standalone-installer) by default.
+OhMyScrapper scrapes texts and urls looking for links and jobs-data to create a
+final report with general information about job positions.
 
 ## Scope
 
 - Read texts;
-- Extract links;
-- Use meta og:tags to extract information;
+- Extract and load urls;
+- Scrapes the urls looking for og:tags and titles;
+- Export a list of links with relevant information;
 
 ## Installation
 
+You can install directly in your `pip`:
+```shell
+pip install ohmyscrapper
+```
+
 I recomend to use the [uv](https://docs.astral.sh/uv/getting-started/installation/#standalone-installer), so you can just use the command bellow and everything is installed:
 ```shell
-uv sync
+uv add ohmyscrapper
+uv run ohmyscrapper --version
+```
+
+But you can use everything as a tool, for example:
+```shell
+uvx ohmyscrapper --version
 ```
 
+
 ## How to use and test (development only)
 
 OhMyScrapper works in 3 stages:
 
-1. It collects and loads urls from a text (by default `input/_chat.txt`) in a database;
+1. It collects and loads urls from a text in a database;
 2. It scraps/access the collected urls and read what is relevant. If it finds new urls, they are collected as well;
 3. Export a list of urls in CSV files;
 
 You can do 3 stages with the command:
 ```shell
-make start
+ohmyscrapper start
 ```
-> Remember to add your text file in the folder `/input` with the name `_chat.txt`!
+> Remember to add your text file in the folder `/input` with the name that finishes with `.txt`!
 
 You will find the exported files in the folder `/output` like this:
 - `/output/report.csv`
@@ -60,23 +71,28 @@ You will find the exported files in the folder `/output` like this:
 
 ### BUT: if you want to do step by step, here it is:
 
-First we load a text file you would like to look for urls, the idea here is to
-use the whatsapp history, but it works with any txt file.
+First we load a text file you would like to look for urls. It it works with any txt file.
 
-The default file is `input/_chat.txt`. If you have the default file you just use
-the command `load`:
+The default folder is `/input`. Put one or more text (finished with `.txt`) files
+in this folder and use the command `load`:
+```shell
+ohmyscrapper load
+```
+or, if you have another file in a different folder, just use the argument `-input` like this:
 ```shell
-make load
+ohmyscrapper load -input=my-text-file.txt
 ```
-or, if you have another file, just use the argument `-file` like this:
+In this case, you can add an url directly to the database, like this:
 ```shell
-uv run main.py load -file=my-text-file.txt
+ohmyscrapper load -input=https://cesarcardoso.cc/
 ```
+That will append the last url in the database to be scraped.
+
 That will create a database if it doesn't exist and store every url the oh-my-scrapper
 find. After that, let's scrap the urls with the command `scrap-urls`:
 
 ```shell
-make scrap-urls
+ohmyscrapper scrap-urls --recursive --ignore-type
 ```
 
 That will scrap only the linkedin urls we are interested in. For now they are:
@@ -88,23 +104,33 @@ That will scrap only the linkedin urls we are interested in. For now they are:
 
 But we can use every other one generically using the argument `--ignore-type`:
 ```shell
-uv run main.py scrap-urls --ignore-type
+ohmyscrapper scrap-urls --ignore-type
 ```
 
 And we can ask to make it recursively adding the argument `--recursive`:
 ```shell
-uv run main.py scrap-urls --recursive
+ohmyscrapper scrap-urls --recursive
 ```
 > !!! important: we are not sure about blocks we can have for excess of requests
 
 And we can finally export with the command:
 ```shell
-make export
+ohmyscrapper export
+ohmyscrapper export --file=output/urls-simplified.csv --simplify
+ohmyscrapper report
 ```
 
 
 That's the basic usage!
 But you can understand more using the help:
 ```shell
-uv run main.py --help
+ohmyscrapper --help
 ```
+
+## See Also
+
+- Github: https://github.com/bouli/ohmyscrapper
+- PyPI: https://pypi.org/project/ohmyscrapper/
+
+## License
+This package is distributed under the [MIT license](https://opensource.org/license/MIT).
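The README hunks above replace the old `make` / `uv run main.py` invocations with the `ohmyscrapper` console script. As a minimal sketch, the three documented stages (load, scrap-urls, export) could also be driven from Python rather than a shell, assuming the package is installed, the `ohmyscrapper` entry point is on PATH, and `./input` holds at least one `.txt` file:

```python
# Sketch only: wraps the CLI commands shown in the README diff above.
import subprocess

for args in (
    ["ohmyscrapper", "load"],                                        # stage 1: collect urls from ./input/*.txt
    ["ohmyscrapper", "scrap-urls", "--recursive", "--ignore-type"],  # stage 2: scrape stored urls, following new ones
    ["ohmyscrapper", "export"],                                      # stage 3: write the CSV report(s) to ./output
):
    subprocess.run(args, check=True)  # stop on the first failing stage
```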
--- /dev/null
+++ b/ohmyscrapper-0.6.1.dist-info/RECORD
@@ -0,0 +1,20 @@
+ohmyscrapper/__init__.py,sha256=TGOizxll-06nyJdYSM8SRUccQ5Xhv6dDNW6sIbuH0Mk,6493
+ohmyscrapper/__main__.py,sha256=5BjNuyet8AY-POwoF5rGt722rHQ7tJ0Vf0UFUfzzi-I,58
+ohmyscrapper/core/config.py,sha256=_me0T6IQqz7bA6Kh6IofNrb-o-07nipcLozUuPrz0l4,2722
+ohmyscrapper/core/config_files.py,sha256=KC3yChTnlclclU9EKTqFBoAu9p6XdOKuegub5NPYDDY,2434
+ohmyscrapper/core/default_files/config.yaml,sha256=9nMOhnnJUcZudXUq5WBEXCCgezfUKI3m4azIuSch_wQ,214
+ohmyscrapper/core/default_files/url_types.yaml,sha256=20kvv8_iWRT-pLa014RXYpAmPSonn6tDnG302rx7l-o,228
+ohmyscrapper/models/urls_manager.py,sha256=93WvHnk89hA2BfJfDsD2JlZBeRxo2T_F3FfypiRKKHs,11523
+ohmyscrapper/modules/classify_urls.py,sha256=4rt7_iPDcCGHhJg-f75wBfFmvjdvQj1xFFP-if_IeFM,926
+ohmyscrapper/modules/load_txt.py,sha256=dNkUZ2ehBiPx-q4fPczRiHFvnpzCrjeycFtexhWGmEE,3967
+ohmyscrapper/modules/merge_dbs.py,sha256=0pK3PPUGSbnaDkdpQUGCHemOVaKO37bfHwnsy_EVpWQ,115
+ohmyscrapper/modules/process_with_ai.py,sha256=Th-HMJzQYGQ4UBG8AGFsF5cCKIa1HlPATfmGLTTAE24,7222
+ohmyscrapper/modules/scrap_urls.py,sha256=dxpvPyJWtmQj1vZ6IgnhcICWw1eOxYOeplDfZzDTLw4,6864
+ohmyscrapper/modules/seed.py,sha256=qDUE7TWx9iNQEzqThK4p7g8pTZjdpkmoqI8kOo_zdtk,983
+ohmyscrapper/modules/show.py,sha256=jsAs4g8ouA9wymkBfkDCbpVWKD-m_20uKG-m1cZAUGA,3877
+ohmyscrapper/modules/sniff_url.py,sha256=dF6Nv54TC1Si-FRyqtw4V2WNk3NqaJ1h_PzwZm3UNzk,4126
+ohmyscrapper/modules/untouch_all.py,sha256=DAwWYfqMFifHPtFCxSamu0AxHCgk6aJbTnBy6wLucXM,167
+ohmyscrapper-0.6.1.dist-info/WHEEL,sha256=xDCZ-UyfvkGuEHPeI7BcJzYKIZzdqN8A8o1M5Om8IyA,79
+ohmyscrapper-0.6.1.dist-info/entry_points.txt,sha256=BZud6D16XkfjelDa4Z33mji-KJbbZXgq2FoLrzjru5I,52
+ohmyscrapper-0.6.1.dist-info/METADATA,sha256=k06ZCfkLkDuy_GvCj6jAFq2xfCUA5gN8cVlDH-2Q6Bs,4096
+ohmyscrapper-0.6.1.dist-info/RECORD,,
--- a/ohmyscrapper-0.2.1.dist-info/RECORD
+++ /dev/null
@@ -1,16 +0,0 @@
-ohmyscrapper/__init__.py,sha256=OOoRFtkBKaTIf74FStI0MGtk-LUQOuN0QnBZRfRWauA,5145
-ohmyscrapper/__main__.py,sha256=5BjNuyet8AY-POwoF5rGt722rHQ7tJ0Vf0UFUfzzi-I,58
-ohmyscrapper/models/urls_manager.py,sha256=xKql_xdwfRwgpMyriuIrZ0Srz4gYQGMfWClEWpGRtNE,11183
-ohmyscrapper/modules/classify_urls.py,sha256=eyHtTHDZp2pGmYw_X-7LrbeVOgDPcRQdhu0oEuwQtKA,743
-ohmyscrapper/modules/load_txt.py,sha256=mL60OGsh-R80P88vxyqvfBEFag9yhSFFbg5pwtu1f90,889
-ohmyscrapper/modules/merge_dbs.py,sha256=0pK3PPUGSbnaDkdpQUGCHemOVaKO37bfHwnsy_EVpWQ,115
-ohmyscrapper/modules/process_with_ai.py,sha256=TpumucIVNZulKOw2idy4hD3vG5IhG5pbhyJImYFP8g0,6844
-ohmyscrapper/modules/scrap_urls.py,sha256=KQVs3R03X80hmvvJAU1SqnNhwXEeVV99WlN8TxSKqA8,6097
-ohmyscrapper/modules/seed.py,sha256=KeTSbmTdNTkVCtzk9iQmeuEqB0kG-rTZJb2a1WdROL4,129
-ohmyscrapper/modules/show.py,sha256=u0L9uxgU8Xt_-myA3r7byuOmnX_-2gkpTtXWkXon1ns,3572
-ohmyscrapper/modules/sniff_url.py,sha256=jQDc7aSimuOOedw2fSXZlf6_o0OqQHOr6NsWb4n0XgI,2720
-ohmyscrapper/modules/untouch_all.py,sha256=E1U9e3sOG7suzc8ZTWcYiQQo9mPmLJ0piXdXUjFLEd4,162
-ohmyscrapper-0.2.1.dist-info/WHEEL,sha256=xDCZ-UyfvkGuEHPeI7BcJzYKIZzdqN8A8o1M5Om8IyA,79
-ohmyscrapper-0.2.1.dist-info/entry_points.txt,sha256=BZud6D16XkfjelDa4Z33mji-KJbbZXgq2FoLrzjru5I,52
-ohmyscrapper-0.2.1.dist-info/METADATA,sha256=Sl1HuVlxTSSAYz9ga0zJ9xUpWGY2NZOkNu1xTNtGUu8,3411
-ohmyscrapper-0.2.1.dist-info/RECORD,,