ohmyscrapper 0.2.3__py3-none-any.whl → 0.6.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ohmyscrapper/__init__.py +44 -22
- ohmyscrapper/core/config.py +95 -0
- ohmyscrapper/core/config_files.py +73 -0
- ohmyscrapper/core/default_files/config.yaml +15 -0
- ohmyscrapper/core/default_files/url_types.yaml +5 -0
- ohmyscrapper/models/urls_manager.py +67 -25
- ohmyscrapper/modules/classify_urls.py +9 -5
- ohmyscrapper/modules/load_txt.py +79 -11
- ohmyscrapper/modules/process_with_ai.py +72 -36
- ohmyscrapper/modules/scrap_urls.py +80 -49
- ohmyscrapper/modules/seed.py +28 -2
- ohmyscrapper/modules/show.py +22 -14
- ohmyscrapper/modules/sniff_url.py +82 -38
- ohmyscrapper/modules/untouch_all.py +1 -1
- {ohmyscrapper-0.2.3.dist-info → ohmyscrapper-0.6.1.dist-info}/METADATA +21 -15
- ohmyscrapper-0.6.1.dist-info/RECORD +20 -0
- ohmyscrapper-0.2.3.dist-info/RECORD +0 -16
- {ohmyscrapper-0.2.3.dist-info → ohmyscrapper-0.6.1.dist-info}/WHEEL +0 -0
- {ohmyscrapper-0.2.3.dist-info → ohmyscrapper-0.6.1.dist-info}/entry_points.txt +0 -0
ohmyscrapper/modules/sniff_url.py

@@ -6,8 +6,11 @@ import json
 def sniff_url(url="https://www.linkedin.com/in/cesardesouzacardoso/", silent=False):
     if not silent:
         print("checking url:", url)
-
-
+
+    r = requests.get(url=url)
+    soup = BeautifulSoup(r.text, "html.parser")
+
+    metatags_to_search = [
         "description",
         "og:url",
         "og:title",
@@ -16,18 +19,62 @@ def sniff_url(url="https://www.linkedin.com/in/cesardesouzacardoso/", silent=False):
         "lnkd:url",
     ]

-
-
+    text_tags_to_search = {
+        "h1": "",
+        "h2": "|",
+    }
+
+    final_report = {}
+    final_report["scrapped-url"] = url
+    final_report.update(
+        _extract_meta_tags(
+            soup=soup, silent=silent, metatags_to_search=metatags_to_search
+        )
+    )
+    final_report.update(
+        _extract_text_tags(
+            soup=soup, silent=silent, text_tags_to_search=text_tags_to_search
+        )
+    )
+    final_report["a_links"] = _extract_a_tags(soup=soup, silent=silent)
+    final_report = _complementary_report(final_report, soup, silent).copy()
+    final_report["json"] = json.dumps(final_report)
+
+    return final_report
+
+
+def _extract_a_tags(soup, silent):
+    a_links = []
+    if not silent:
+        print("\n\n\n\n---- all <a> links ---")
+
+    i = 0
+    for a_tag in soup.find_all("a"):
+        i = i + 1
+        a_links.append({"text": a_tag.text, "href": a_tag.get("href")})
+        if not silent:
+            print("\n-- <a> link", i, "-- ")
+            print("target:", a_tag.get("target"))
+            print("text:", str(a_tag.text).strip())
+            print("href:", a_tag.get("href"))
+            print("-------------- ")
+    return a_links

+
+def _extract_meta_tags(soup, silent, metatags_to_search):
+    valid_meta_tags = {}
     if not silent:
         print("\n\n\n\n---- all <meta> tags ---\n")
     i = 0
     for meta_tag in soup.find_all("meta"):
         if (
-            meta_tag.get("name") in
-            or meta_tag.get("property") in
+            meta_tag.get("name") in metatags_to_search
+            or meta_tag.get("property") in metatags_to_search
         ):
-
+            if meta_tag.get("name") is not None:
+                valid_meta_tags[meta_tag.get("name")] = meta_tag.get("content")
+            elif meta_tag.get("property") is not None:
+                valid_meta_tags[meta_tag.get("property")] = meta_tag.get("content")
         i = i + 1
         if not silent:
             print("-- meta tag", i, "--")
@@ -35,51 +82,48 @@ def sniff_url(url="https://www.linkedin.com/in/cesardesouzacardoso/", silent=False):
             print("property:", meta_tag.get("property"))
             print("content:", meta_tag.get("content"))
             print("---------------- \n")
+    return valid_meta_tags
+

+def _extract_text_tags(soup, silent, text_tags_to_search):
+    valid_text_tags = {}
     if not silent:
-        print("\n\n\n\n---- all <
-
-
-
-
-
-
-
+        print("\n\n\n\n---- all <text> tags ---\n")
+    i = 0
+    for text_tag, separator in text_tags_to_search.items():
+        if len(soup.find_all(text_tag)) > 0:
+            valid_text_tags[text_tag] = []
+            for obj_tag in soup.find_all(text_tag):
+                valid_text_tags[text_tag].append(obj_tag.text.strip())
+            valid_text_tags[text_tag] = separator.join(valid_text_tags[text_tag])
+            i = i + 1
+            if not silent:
+                print("-- text tag", i, "--")
+                print("name:", text_tag)
+                print("separator:", separator)
+                print("texts:", valid_text_tags[text_tag])
+                print("---------------- \n")
+    return valid_text_tags

-
-
-
-
-
-
-        if report_meta_tag.get("name") is not None:
-            final_report[report_meta_tag.get("name")] = report_meta_tag.get("content")
-        elif report_meta_tag.get("property") is not None:
-            final_report[report_meta_tag.get("property")] = report_meta_tag.get(
-                "content"
-            )
-
-    if len(soup.find_all("a")) > 0:
-        final_report["first-a-link"] = soup.find("a").get("href")
-        final_report["total-a-links"] = len(soup.find_all("a"))
+
+def _complementary_report(final_report, soup, silent):
+
+    if len(final_report["a_links"]) > 0:
+        final_report["first-a-link"] = final_report["a_links"][0]["href"]
+        final_report["total-a-links"] = len(final_report["a_links"])
     else:
         final_report["first-a-link"] = ""
         final_report["total-a-links"] = 0

-    if len(soup.find_all("h2")) > 0:
-        final_report["h2"] = soup.find("h2").text
-
     if len(soup.find_all("meta")) > 0:
         final_report["total-meta-tags"] = len(soup.find_all("meta"))
     else:
         final_report["total-meta-tags"] = 0
-
-    final_report["json"] = json.dumps(final_report)
     if not silent:
         print("\n\n\n----report---\n")
         for key in final_report:
-
+            if key != "a_links":
+                print("* ", key, ":", final_report[key])

     return final_report

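For orientation, the rewrite above splits `sniff_url` into per-tag helpers (`_extract_meta_tags`, `_extract_text_tags`, `_extract_a_tags`, `_complementary_report`). The core extraction pattern they share is plain requests + BeautifulSoup; here is a minimal standalone sketch of that idea, using only the two dependencies the diff shows. `extract_meta` and `wanted` are illustrative names, not part of the package:

```python
# Minimal sketch of the meta-tag extraction pattern used by the new
# _extract_meta_tags helper. Names here are hypothetical, not the package's API.
import requests
from bs4 import BeautifulSoup

def extract_meta(url, wanted=("description", "og:url", "og:title")):
    soup = BeautifulSoup(requests.get(url).text, "html.parser")
    found = {}
    for tag in soup.find_all("meta"):
        # A <meta> tag carries its key in either "name" or "property" (og:*).
        key = tag.get("name") or tag.get("property")
        if key in wanted:
            found[key] = tag.get("content")
    return found

print(extract_meta("https://example.com/"))
```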
{ohmyscrapper-0.2.3.dist-info → ohmyscrapper-0.6.1.dist-info}/METADATA

@@ -1,7 +1,7 @@
 Metadata-Version: 2.3
 Name: ohmyscrapper
-Version: 0.2.3
-Summary:
+Version: 0.6.1
+Summary: OhMyScrapper scrapes texts and urls looking for links and jobs-data to create a final report with general information about job positions.
 Author: Cesar Cardoso
 Author-email: Cesar Cardoso <hello@cesarcardoso.cc>
 Requires-Dist: beautifulsoup4>=4.14.3
@@ -16,16 +16,17 @@ Requires-Dist: urlextract>=1.9.0
 Requires-Python: >=3.11
 Description-Content-Type: text/markdown

-# OhMyScrapper - v0.2.3
+# 🐶 OhMyScrapper - v0.6.1

-
-final
+OhMyScrapper scrapes texts and urls looking for links and jobs-data to create a
+final report with general information about job positions.

 ## Scope

 - Read texts;
-- Extract
--
+- Extract and load urls;
+- Scrapes the urls looking for og:tags and titles;
+- Export a list of links with relevant information;

 ## Installation

@@ -50,7 +51,7 @@ uvx ohmyscrapper --version

 OhMyScrapper works in 3 stages:

-1. It collects and loads urls from a text
+1. It collects and loads urls from a text in a database;
 2. It scraps/access the collected urls and read what is relevant. If it finds new urls, they are collected as well;
 3. Export a list of urls in CSV files;

@@ -58,7 +59,7 @@ You can do 3 stages with the command:
 ```shell
 ohmyscrapper start
 ```
-> Remember to add your text file in the folder `/input` with the name
+> Remember to add your text file in the folder `/input` with the name that finishes with `.txt`!

 You will find the exported files in the folder `/output` like this:
 - `/output/report.csv`
@@ -70,18 +71,23 @@ You will find the exported files in the folder `/output` like this:

 ### BUT: if you want to do step by step, here it is:

-First we load a text file you would like to look for urls
-use the whatsapp history, but it works with any txt file.
+First we load a text file you would like to look for urls. It it works with any txt file.

-The default
-the command `load`:
+The default folder is `/input`. Put one or more text (finished with `.txt`) files
+in this folder and use the command `load`:
 ```shell
 ohmyscrapper load
 ```
-or, if you have another file, just use the argument `-
+or, if you have another file in a different folder, just use the argument `-input` like this:
 ```shell
-ohmyscrapper load -
+ohmyscrapper load -input=my-text-file.txt
 ```
+In this case, you can add an url directly to the database, like this:
+```shell
+ohmyscrapper load -input=https://cesarcardoso.cc/
+```
+That will append the last url in the database to be scraped.
+
 That will create a database if it doesn't exist and store every url the oh-my-scrapper
 find. After that, let's scrap the urls with the command `scrap-urls`:

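The README's `load` stage pulls urls out of arbitrary text, and the metadata declares `urlextract>=1.9.0`, so the core of that step presumably resembles the following sketch. `urls_from_txt` and the file path are illustrative, not the package's own code:

```python
# Hypothetical sketch of the load stage's core idea: extract urls from a
# plain text file with urlextract, one of the package's declared dependencies.
from urlextract import URLExtract

def urls_from_txt(path):
    extractor = URLExtract()
    with open(path, encoding="utf-8") as f:
        return extractor.find_urls(f.read())

print(urls_from_txt("input/my-text-file.txt"))
```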
ohmyscrapper-0.6.1.dist-info/RECORD

@@ -0,0 +1,20 @@
+ohmyscrapper/__init__.py,sha256=TGOizxll-06nyJdYSM8SRUccQ5Xhv6dDNW6sIbuH0Mk,6493
+ohmyscrapper/__main__.py,sha256=5BjNuyet8AY-POwoF5rGt722rHQ7tJ0Vf0UFUfzzi-I,58
+ohmyscrapper/core/config.py,sha256=_me0T6IQqz7bA6Kh6IofNrb-o-07nipcLozUuPrz0l4,2722
+ohmyscrapper/core/config_files.py,sha256=KC3yChTnlclclU9EKTqFBoAu9p6XdOKuegub5NPYDDY,2434
+ohmyscrapper/core/default_files/config.yaml,sha256=9nMOhnnJUcZudXUq5WBEXCCgezfUKI3m4azIuSch_wQ,214
+ohmyscrapper/core/default_files/url_types.yaml,sha256=20kvv8_iWRT-pLa014RXYpAmPSonn6tDnG302rx7l-o,228
+ohmyscrapper/models/urls_manager.py,sha256=93WvHnk89hA2BfJfDsD2JlZBeRxo2T_F3FfypiRKKHs,11523
+ohmyscrapper/modules/classify_urls.py,sha256=4rt7_iPDcCGHhJg-f75wBfFmvjdvQj1xFFP-if_IeFM,926
+ohmyscrapper/modules/load_txt.py,sha256=dNkUZ2ehBiPx-q4fPczRiHFvnpzCrjeycFtexhWGmEE,3967
+ohmyscrapper/modules/merge_dbs.py,sha256=0pK3PPUGSbnaDkdpQUGCHemOVaKO37bfHwnsy_EVpWQ,115
+ohmyscrapper/modules/process_with_ai.py,sha256=Th-HMJzQYGQ4UBG8AGFsF5cCKIa1HlPATfmGLTTAE24,7222
+ohmyscrapper/modules/scrap_urls.py,sha256=dxpvPyJWtmQj1vZ6IgnhcICWw1eOxYOeplDfZzDTLw4,6864
+ohmyscrapper/modules/seed.py,sha256=qDUE7TWx9iNQEzqThK4p7g8pTZjdpkmoqI8kOo_zdtk,983
+ohmyscrapper/modules/show.py,sha256=jsAs4g8ouA9wymkBfkDCbpVWKD-m_20uKG-m1cZAUGA,3877
+ohmyscrapper/modules/sniff_url.py,sha256=dF6Nv54TC1Si-FRyqtw4V2WNk3NqaJ1h_PzwZm3UNzk,4126
+ohmyscrapper/modules/untouch_all.py,sha256=DAwWYfqMFifHPtFCxSamu0AxHCgk6aJbTnBy6wLucXM,167
+ohmyscrapper-0.6.1.dist-info/WHEEL,sha256=xDCZ-UyfvkGuEHPeI7BcJzYKIZzdqN8A8o1M5Om8IyA,79
+ohmyscrapper-0.6.1.dist-info/entry_points.txt,sha256=BZud6D16XkfjelDa4Z33mji-KJbbZXgq2FoLrzjru5I,52
+ohmyscrapper-0.6.1.dist-info/METADATA,sha256=k06ZCfkLkDuy_GvCj6jAFq2xfCUA5gN8cVlDH-2Q6Bs,4096
+ohmyscrapper-0.6.1.dist-info/RECORD,,
ohmyscrapper-0.2.3.dist-info/RECORD

@@ -1,16 +0,0 @@
-ohmyscrapper/__init__.py,sha256=6k-fyuKkTefy5lJiRJFDP7wfkGhYRC8qgdfYlNB_3sk,5841
-ohmyscrapper/__main__.py,sha256=5BjNuyet8AY-POwoF5rGt722rHQ7tJ0Vf0UFUfzzi-I,58
-ohmyscrapper/models/urls_manager.py,sha256=xKql_xdwfRwgpMyriuIrZ0Srz4gYQGMfWClEWpGRtNE,11183
-ohmyscrapper/modules/classify_urls.py,sha256=eyHtTHDZp2pGmYw_X-7LrbeVOgDPcRQdhu0oEuwQtKA,743
-ohmyscrapper/modules/load_txt.py,sha256=mL60OGsh-R80P88vxyqvfBEFag9yhSFFbg5pwtu1f90,889
-ohmyscrapper/modules/merge_dbs.py,sha256=0pK3PPUGSbnaDkdpQUGCHemOVaKO37bfHwnsy_EVpWQ,115
-ohmyscrapper/modules/process_with_ai.py,sha256=TpumucIVNZulKOw2idy4hD3vG5IhG5pbhyJImYFP8g0,6844
-ohmyscrapper/modules/scrap_urls.py,sha256=KQVs3R03X80hmvvJAU1SqnNhwXEeVV99WlN8TxSKqA8,6097
-ohmyscrapper/modules/seed.py,sha256=KeTSbmTdNTkVCtzk9iQmeuEqB0kG-rTZJb2a1WdROL4,129
-ohmyscrapper/modules/show.py,sha256=u0L9uxgU8Xt_-myA3r7byuOmnX_-2gkpTtXWkXon1ns,3572
-ohmyscrapper/modules/sniff_url.py,sha256=jQDc7aSimuOOedw2fSXZlf6_o0OqQHOr6NsWb4n0XgI,2720
-ohmyscrapper/modules/untouch_all.py,sha256=E1U9e3sOG7suzc8ZTWcYiQQo9mPmLJ0piXdXUjFLEd4,162
-ohmyscrapper-0.2.3.dist-info/WHEEL,sha256=xDCZ-UyfvkGuEHPeI7BcJzYKIZzdqN8A8o1M5Om8IyA,79
-ohmyscrapper-0.2.3.dist-info/entry_points.txt,sha256=BZud6D16XkfjelDa4Z33mji-KJbbZXgq2FoLrzjru5I,52
-ohmyscrapper-0.2.3.dist-info/METADATA,sha256=uwthvf7vwhb6H14KdbPLibzp1c3PaVmcJIYePmV8cRc,3832
-ohmyscrapper-0.2.3.dist-info/RECORD,,

{ohmyscrapper-0.2.3.dist-info → ohmyscrapper-0.6.1.dist-info}/WHEEL: file without changes
{ohmyscrapper-0.2.3.dist-info → ohmyscrapper-0.6.1.dist-info}/entry_points.txt: file without changes
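The RECORD entries above follow the standard wheel format (PEP 376/427): `path,sha256=<urlsafe-base64 digest, unpadded>,size`. If you want to verify an unpacked file against its RECORD line, the hash can be recomputed like this (the path is illustrative):

```python
# Recompute a wheel RECORD hash: sha256 digest, urlsafe-base64 encoded,
# with the trailing "=" padding stripped, prefixed with "sha256=".
import base64
import hashlib

def record_hash(path):
    with open(path, "rb") as f:
        digest = hashlib.sha256(f.read()).digest()
    return "sha256=" + base64.urlsafe_b64encode(digest).rstrip(b"=").decode()

print(record_hash("ohmyscrapper/modules/sniff_url.py"))
```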