ohmyscrapper 0.2.3__py3-none-any.whl → 0.6.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ohmyscrapper/__init__.py +44 -22
- ohmyscrapper/core/config.py +95 -0
- ohmyscrapper/core/config_files.py +73 -0
- ohmyscrapper/core/default_files/config.yaml +15 -0
- ohmyscrapper/core/default_files/url_types.yaml +5 -0
- ohmyscrapper/models/urls_manager.py +67 -25
- ohmyscrapper/modules/classify_urls.py +9 -5
- ohmyscrapper/modules/load_txt.py +79 -11
- ohmyscrapper/modules/process_with_ai.py +72 -36
- ohmyscrapper/modules/scrap_urls.py +80 -49
- ohmyscrapper/modules/seed.py +28 -2
- ohmyscrapper/modules/show.py +22 -14
- ohmyscrapper/modules/sniff_url.py +82 -38
- ohmyscrapper/modules/untouch_all.py +1 -1
- {ohmyscrapper-0.2.3.dist-info → ohmyscrapper-0.6.1.dist-info}/METADATA +21 -15
- ohmyscrapper-0.6.1.dist-info/RECORD +20 -0
- ohmyscrapper-0.2.3.dist-info/RECORD +0 -16
- {ohmyscrapper-0.2.3.dist-info → ohmyscrapper-0.6.1.dist-info}/WHEEL +0 -0
- {ohmyscrapper-0.2.3.dist-info → ohmyscrapper-0.6.1.dist-info}/entry_points.txt +0 -0
ohmyscrapper/modules/load_txt.py
CHANGED
@@ -1,31 +1,99 @@
 import os
 from urlextract import URLExtract
 import ohmyscrapper.models.urls_manager as urls_manager
+from ohmyscrapper.core import config


-def
+def _increment_file_name(text_file_content, file_name):
+    print(f"reading and loading file `{file_name}`... ")
+    with open(file_name, "r") as f:
+        return text_file_content + f.read()

-    if not os.path.exists("input"):
-        os.mkdir("input")

-
+def load_txt(file_name="input", verbose=False):
+    input_folder = config.get_dir("input")
+    if not os.path.exists(input_folder):
+        os.mkdir(input_folder)
+
     urls_manager.seeds()
-    # make it recursive for all files
-    text_file_content = open(file_name, "r").read()

-
+    text_file_content = ""
+    if file_name is not None and not os.path.isdir(file_name):
+        print(f"📖 reading file `{file_name}`... ")
+        if not os.path.exists(file_name):
+            if file_name.startswith("https://") or file_name.startswith("http://"):
+                text_file_content = " " + file_name + " "
+            else:
+                print(f"\n file `{file_name}` not found.")
+                return
+        else:
+            text_file_content = _increment_file_name(
+                text_file_content=text_file_content, file_name=file_name
+            )
+    else:
+        input_folder = config.get_dir("input")
+        print(f"📂 reading {input_folder} directory... ")
+        if file_name is None:
+            dir_files = input_folder
+        else:
+            dir_files = file_name
+        text_files = os.listdir(dir_files)
+        for file in text_files:
+            if not file.endswith(".txt"):
+                text_files.remove(file)
+        if len(text_files) == 0:
+            print(f"No text files found in {input_folder} directory!")
+            return
+        elif len(text_files) == 1:
+            print(f"📖 reading file `{dir_files}/{text_files[0]}`... ")
+            text_file_content = _increment_file_name(
+                text_file_content=text_file_content,
+                file_name=os.path.join(dir_files, text_files[0]),
+            )
+        else:
+            print("\nChoose a text file. Use `*` for process all and `q` to quit:")
+            for index, file in enumerate(text_files):
+                print(f"[{index}]:", os.path.join(dir_files, file))
+
+            text_file_option = -1
+            while text_file_option < 0 or text_file_option >= len(text_files):
+                text_file_option = input("Enter the file number: ")
+                if text_file_option == "*":
+                    for file in text_files:
+                        text_file_content = _increment_file_name(
+                            text_file_content=text_file_content,
+                            file_name=os.path.join(dir_files, file),
+                        )
+                    text_file_option = 0
+                elif text_file_option == "q":
+                    return
+                elif text_file_option.isdigit():
+                    text_file_option = int(text_file_option)
+                    if text_file_option >= 0 and text_file_option < len(text_files):
+                        text_file_content = _increment_file_name(
+                            text_file_content=text_file_content,
+                            file_name=os.path.join(
+                                dir_files, text_files[int(text_file_option)]
+                            ),
+                        )
+
+    print("🔎 looking for urls...")
+    urls_found = put_urls_from_string(
+        text_to_process=text_file_content, verbose=verbose
+    )

-    # move_it_to_processed
     print("--------------------")
-    print(
+    print("files processed")
+    print(f"📦 {urls_found} urls were extracted and packed into the database")


-def put_urls_from_string(text_to_process, parent_url=None):
+def put_urls_from_string(text_to_process, parent_url=None, verbose=False):
     if isinstance(text_to_process, str):
         extractor = URLExtract()
         for url in extractor.find_urls(text_to_process):
             urls_manager.add_url(url=url, parent_url=parent_url)
-
+            if verbose:
+                print(url, "added")

         return len(extractor.find_urls(text_to_process))
     else:
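For context on the reworked `load_txt` flow above: everything it reads (a single file, a whole input directory, or a bare URL passed as the argument) is concatenated into one string and handed to `put_urls_from_string`, which relies on `urlextract` to pull URLs out of the text. A minimal standalone sketch of that extraction step; the sample text and the `extract_urls` helper are illustrative, not part of the package:

```python
# Standalone sketch of the extraction step behind put_urls_from_string():
# URLExtract.find_urls() returns every URL found in a blob of text, and the
# package then hands each hit to urls_manager.add_url(). Sample text is made up.
from urlextract import URLExtract


def extract_urls(text: str) -> list[str]:
    extractor = URLExtract()
    return extractor.find_urls(text)


if __name__ == "__main__":
    sample = "notes https://lnkd.in/abc123 and https://www.linkedin.com/jobs/view/42"
    for url in extract_urls(sample):
        print(url, "added")
```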
ohmyscrapper/modules/process_with_ai.py
CHANGED

@@ -1,4 +1,5 @@
 import ohmyscrapper.models.urls_manager as urls_manager
+from ohmyscrapper.core import config
 from bs4 import BeautifulSoup
 from google import genai
 from dotenv import load_dotenv
@@ -7,9 +8,11 @@ import time
 import os
 import yaml
 import json
+
 # TODO: !!! REFACTOR !!!
 load_dotenv()

+
 def reprocess_ai_history():
     df = urls_manager.get_ai_log().to_dict(orient="records")
     for row in df:
@@ -17,28 +20,34 @@ def reprocess_ai_history():


 def process_ai_response(response):
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    job_positions = xml2dict(response)
+
+    for index, xml_item_children in job_positions.items():
+        for url_child_xml in xml_item_children:
+
+            url_parent = urls_manager.get_url_by_id(url_child_xml["id"])
+            if len(url_parent) > 0:
+                url_parent = url_parent.iloc[0]
+                h1 = url_child_xml.copy()
+                del h1["id"]
+                del h1["url"]
+                h1 = " - ".join(h1.values())
+                if url_parent["description_links"] > 1 and url_child_xml["id"] != "":
+                    print("-- child updated -- \n", url_child_xml["url"], ":", h1)
+                    urls_manager.set_url_h1(url_child_xml["url"], h1)
+                    urls_manager.set_url_ai_processed_by_url(
+                        url_child_xml["url"], str(json.dumps(url_child_xml))
+                    )
+                    if url_parent["url"] != url_child_xml["url"]:
+                        urls_manager.set_url_ai_processed_by_url(
+                            url_parent["url"], "children-update"
+                        )
+                else:
+                    print("-- parent updated -- \n", url_parent["url"], ":", h1)
+                    urls_manager.set_url_h1(url_parent["url"], h1)
+                    urls_manager.set_url_ai_processed_by_url(
+                        url_parent["url"], str(json.dumps(url_child_xml))
+                    )


 def xml2dict(xml_string):
@@ -46,19 +55,21 @@ def xml2dict(xml_string):

     children_items_dict = {}
     for item in soup.find_all():
-        if
+        if item.parent.name == "[document]":
             children_items_dict[item.name] = []
         elif item.parent.name in children_items_dict:
             children_items_dict[item.parent.name].append(_xml_children_to_dict(item))

     return children_items_dict

+
 def _xml_children_to_dict(xml):
     item_dict = {}
     for item in xml.find_all():
         item_dict[item.name] = item.text
     return item_dict

+
 def process_with_ai(recursive=True, triggered_times=0):
     triggered_times = triggered_times + 1

@@ -91,13 +102,23 @@ def process_with_ai(recursive=True, triggered_times=0):
     print("prompt:", prompt["name"])
     print("model:", prompt["model"])
     print("description:", prompt["description"])
-    prompt["instructions"] = prompt["instructions"].replace(
+    prompt["instructions"] = prompt["instructions"].replace(
+        "{ohmyscrapper_texts}", texts
+    )

     # The client gets the API key from the environment variable `GEMINI_API_KEY`.
     client = genai.Client()
-    response = client.models.generate_content(
+    response = client.models.generate_content(
+        model=prompt["model"], contents=prompt["instructions"]
+    )
     response = str(response.text)
-    urls_manager.add_ai_log(
+    urls_manager.add_ai_log(
+        instructions=prompt["instructions"],
+        response=response,
+        model=prompt["model"],
+        prompt_name=prompt["name"],
+        prompt_file=prompt["prompt_file"],
+    )
     print(response)
     print("^^^^^^")
     process_ai_response(response=response)
@@ -114,7 +135,9 @@ def process_with_ai(recursive=True, triggered_times=0):
     if triggered_times > 5:
         print("!!! This is a break to prevent budget accident$.")
         print("You triggered", triggered_times, "times the AI processing function.")
-        print(
+        print(
+            "If you are sure this is correct, you can re-call this function again."
+        )
         print("Please, check it.")
         return

@@ -122,8 +145,13 @@ def process_with_ai(recursive=True, triggered_times=0):

     return

+
 def _get_prompt():
-    prompts_path = "prompts"
+    prompts_path = config.get_dir(param="prompts")
+    default_prommpt_file = os.path.join(
+        prompts_path, config.get_ai("default_prompt_file")
+    )
+
     default_prompt = """---
 model: "gemini-2.5-flash"
 name: "default-prompt"
@@ -133,15 +161,18 @@ Process with AI this prompt: {ohmyscrapper_texts}
 """
     if not os.path.exists(prompts_path):
         os.mkdir(prompts_path)
-
-
-
+        open(default_prommpt_file, "w").write(default_prompt)
+        print(
+            f"You didn't have a prompt file. One was created in the /{prompts_path} folder. You can change it there."
+        )
         return False

     prompt_files = os.listdir(prompts_path)
     if len(prompt_files) == 0:
-        open(
-        print(
+        open(default_prommpt_file, "w").write(default_prompt)
+        print(
+            f"You didn't have a prompt file. One was created in the /{prompts_path} folder. You can change it there."
+        )
         return False
     prompt = {}
     if len(prompt_files) == 1:
@@ -151,8 +182,10 @@ Process with AI this prompt: {ohmyscrapper_texts}
         prompts = {}
         for index, file in enumerate(prompt_files):
             prompts[index] = _parse_prompt(prompts_path=prompts_path, prompt_file=file)
-            print(index, ":", prompts[index][
-        input_prompt = input(
+            print(index, ":", prompts[index]["name"])
+        input_prompt = input(
+            "Type the number of the prompt you want to use or 'q' to quit: "
+        )
         if input_prompt == "q":
             return False
         try:
@@ -162,14 +195,17 @@ Process with AI this prompt: {ohmyscrapper_texts}
         prompt = _get_prompt()
     return prompt

+
 def _parse_prompt(prompts_path, prompt_file):
     prompt = {}
-    raw_prompt = open(
+    raw_prompt = open(os.path.join(prompts_path, prompt_file), "r").read().split("---")
     prompt = yaml.safe_load(raw_prompt[1])
     prompt["instructions"] = raw_prompt[2].strip()
     prompt["prompt_file"] = prompt_file

     return prompt
+
+
 # TODO: Separate gemini from basic function
 def _process_with_gemini(model, instructions):
     response = """"""
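The `_get_prompt`/`_parse_prompt` changes above treat each prompt file as a YAML front-matter header and an instruction body separated by `---` markers, mirroring the built-in `default_prompt` template. A self-contained sketch of that parsing, with an illustrative prompt string; only the split-on-`---` and `yaml.safe_load` logic is taken from the diff:

```python
# Sketch of the prompt-file format _parse_prompt() expects: YAML front matter
# between "---" markers, then the instruction text. The example prompt is made up.
import yaml

RAW_PROMPT = """---
model: "gemini-2.5-flash"
name: "default-prompt"
description: "illustrative example"
---
Process with AI this prompt: {ohmyscrapper_texts}
"""


def parse_prompt(raw: str) -> dict:
    parts = raw.split("---")
    prompt = yaml.safe_load(parts[1])          # header -> model, name, description
    prompt["instructions"] = parts[2].strip()  # everything after the second "---"
    return prompt


if __name__ == "__main__":
    parsed = parse_prompt(RAW_PROMPT)
    print(parsed["model"], "|", parsed["instructions"])
```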
ohmyscrapper/modules/scrap_urls.py
CHANGED

@@ -7,72 +7,87 @@ import time
 import random


-def process_linkedin_redirect(url_report, url):
-
+def process_linkedin_redirect(url_report, url, verbose=False):
+    if verbose:
+        print("linkedin_redirect")

     if url_report["total-a-links"] < 5:
         if "first-a-link" in url_report.keys():
             url_destiny = url_report["first-a-link"]
         else:
             urls_manager.set_url_error(url=url["url"], value="error: no first-a-link")
-
+            if verbose:
+                print("no url for:", url["url"])
             return
     else:
         if "og:url" in url_report.keys():
             url_destiny = url_report["og:url"]
         else:
             urls_manager.set_url_error(url=url["url"], value="error: no og:url")
-
+            if verbose:
+                print("no url for:", url["url"])
             return
-
-
+    if verbose:
+        print(url["url"], ">>", url_destiny)
     urls_manager.add_url(url=url_destiny)
     urls_manager.set_url_destiny(url=url["url"], destiny=url_destiny)


-def process_linkedin_feed(url_report, url):
-
+def process_linkedin_feed(url_report, url, verbose=False):
+    if verbose:
+        print("linkedin_feed")

     if "og:url" in url_report.keys():
         url_destiny = url_report["og:url"]
     else:
         urls_manager.set_url_error(url=url["url"], value="error: no og:url")
-
+        if verbose:
+            print("no url for:", url["url"])
         return

-
+    if verbose:
+        print(url["url"], ">>", url_destiny)
     urls_manager.add_url(url=url_destiny)
     urls_manager.set_url_destiny(url=url["url"], destiny=url_destiny)


-def process_linkedin_job(url_report, url):
-
+def process_linkedin_job(url_report, url, verbose=False):
+    if verbose:
+        print("linkedin_job")
     changed = False
     if "h1" in url_report.keys():
-
+        if verbose:
+            print(url["url"], ": ", url_report["h1"])
         urls_manager.set_url_h1(url=url["url"], value=url_report["h1"])
         changed = True
     elif "og:title" in url_report.keys():
-
+        if verbose:
+            print(url["url"], ": ", url_report["og:title"])
         urls_manager.set_url_h1(url=url["url"], value=url_report["og:title"])
         changed = True

     if "description" in url_report.keys():
-        urls_manager.set_url_description(
+        urls_manager.set_url_description(
+            url=url["url"], value=url_report["description"]
+        )
         changed = True
     elif "og:description" in url_report.keys():
-        urls_manager.set_url_description(
+        urls_manager.set_url_description(
+            url=url["url"], value=url_report["og:description"]
+        )
         changed = True
     if not changed:
         urls_manager.set_url_error(url=url["url"], value="error: no h1 or description")


-def process_linkedin_post(url_report, url):
-
-
+def process_linkedin_post(url_report, url, verbose=False):
+    if verbose:
+        print("linkedin_post or generic")
+        print(url["url"])
     changed = False
     if "h1" in url_report.keys():
-
+        if verbose:
+            print(url["url"], ": ", url_report["h1"])
         urls_manager.set_url_h1(url=url["url"], value=url_report["h1"])
         changed = True
     elif "og:title" in url_report.keys():
@@ -88,52 +103,50 @@ def process_linkedin_post(url_report, url):

     if description is not None:
         urls_manager.set_url_description(url=url["url"], value=description)
-        description_links = load_txt.put_urls_from_string(
+        description_links = load_txt.put_urls_from_string(
+            text_to_process=description, parent_url=url["url"]
+        )
         urls_manager.set_url_description_links(url=url["url"], value=description_links)

     if not changed:
         urls_manager.set_url_error(url=url["url"], value="error: no h1 or description")


-def scrap_url(url):
-    # TODO: Use get_urls_valid_prefix_by_id()
-    df = urls_manager.get_urls_valid_prefix()
-
+def scrap_url(url, verbose=False):
     # TODO: Need to change this

     if url["url_type"] is None:
-
+        if verbose:
+            print("\n\ngeneric:", url["url"])
         url["url_type"] = "generic"
     else:
-
+        if verbose:
+            print("\n\n", url["url_type"] + ":", url["url"])
     try:
         url_report = sniff_url.get_tags(url=url["url"])
     except Exception as e:
         urls_manager.set_url_error(url=url["url"], value="error")
         urls_manager.touch_url(url=url["url"])
-
-
-
-
-
-
+        if verbose:
+            print("\n\n!!! ERROR FOR:", url["url"])
+            print(
+                "\n\n!!! you can check the URL using the command sniff-url",
+                url["url"],
+                "\n\n",
+            )
         return

-    # linkedin_redirect - linkedin (https://lnkd.in/)
     if url["url_type"] == "linkedin_redirect":
-        process_linkedin_redirect(url_report=url_report, url=url)
+        process_linkedin_redirect(url_report=url_report, url=url, verbose=verbose)

-    # linkedin_feed - linkedin (https://%.linkedin.com/feed/)
     if url["url_type"] == "linkedin_feed":
-        process_linkedin_feed(url_report=url_report, url=url)
+        process_linkedin_feed(url_report=url_report, url=url, verbose=verbose)

-    # linkedin_job - linkedin (https://www.linkedin.com/jobs/)
     if url["url_type"] == "linkedin_job":
-        process_linkedin_job(url_report=url_report, url=url)
+        process_linkedin_job(url_report=url_report, url=url, verbose=verbose)

-    # linkedin_job - linkedin (https://www.linkedin.com/jobs/)
     if url["url_type"] == "linkedin_post" or url["url_type"] == "generic":
-        process_linkedin_post(url_report=url_report, url=url)
+        process_linkedin_post(url_report=url_report, url=url, verbose=verbose)

     urls_manager.set_url_json(url=url["url"], value=url_report["json"])
     urls_manager.touch_url(url=url["url"])
@@ -144,35 +157,53 @@ def isNaN(num):


 def scrap_urls(
-    recursive=False,
+    recursive=False,
+    ignore_valid_prefix=False,
+    randomize=False,
+    only_parents=True,
+    verbose=False,
+    n_urls=0,
 ):
+    limit = 10
     classify_urls.classify_urls()
     urls = urls_manager.get_untouched_urls(
         ignore_valid_prefix=ignore_valid_prefix,
         randomize=randomize,
         only_parents=only_parents,
+        limit=limit,
     )
     if len(urls) == 0:
-        print("no urls to scrap")
+        print("📭 no urls to scrap")
+        if n_urls > 0:
+            print(f"-- 🗃️ {n_urls} scraped urls in total...")
+        print("scrapping is over...")
         return
     for index, url in urls.iterrows():
-        scrap_url(url)
-
-        wait = random.randint(15, 20)
         wait = random.randint(1, 3)
-        print(
+        print(
+            "🐶 Scrapper is sleeping for", wait, "seconds before scraping next url..."
+        )
         time.sleep(wait)

+        print("🐕 Scrapper is sniffing the url...")
+        scrap_url(url=url, verbose=verbose)
+
+    n_urls = n_urls + len(urls)
+    print(f"-- 🗃️ {n_urls} scraped urls...")
     classify_urls.classify_urls()
     if recursive:
         wait = random.randint(5, 10)
-        print(
+        print(
+            f"🐶 Scrapper is sleeping for {wait} seconds before next round of {limit} urls"
+        )
         time.sleep(wait)
         scrap_urls(
             recursive=recursive,
             ignore_valid_prefix=ignore_valid_prefix,
             randomize=randomize,
             only_parents=only_parents,
+            verbose=verbose,
+            n_urls=n_urls,
         )
     else:
-        print("
+        print("scrapping is over...")
ohmyscrapper/modules/seed.py
CHANGED
@@ -1,7 +1,33 @@
 import ohmyscrapper.models.urls_manager as urls_manager
+from ohmyscrapper.core import config


 def seed():
-
-
+    if not config.url_types_file_exists():
+        db_url_types = urls_manager.get_urls_valid_prefix()
+        if len(db_url_types) > 0:
+            export_url_types_to_file()
+            print("🪹 you have a new `url_types.yaml` based on your db! =)")
+            return
+
+    seeds = get_url_types_from_file()
+
+    if len(seeds) > 0:
+        urls_manager.seeds(seeds=seeds)
+        print("🫒 db seeded")
     return
+
+
+def get_url_types_from_file():
+    url_types_from_file = config.get_url_types()
+    if url_types_from_file is None:
+        url_types_from_file = {}
+    return url_types_from_file
+
+
+def export_url_types_to_file():
+    url_types = urls_manager.get_urls_valid_prefix()
+    yaml_url_types = {}
+    for index, url_type in url_types.iterrows():
+        yaml_url_types[url_type["url_type"]] = url_type["url_prefix"]
+    config.append_url_types(yaml_url_types)
ohmyscrapper/modules/show.py
CHANGED
@@ -1,10 +1,14 @@
 import ohmyscrapper.models.urls_manager as urls_manager
+from ohmyscrapper.core import config
 import math
+import os
 from rich.console import Console
 from rich.table import Table


 def export_urls(limit=0, csv_file="output/urls.csv", simplify=False):
+    output_folder = config.get_dir("output")
+
     df = urls_manager.get_urls(limit=limit)

     if simplify:
@@ -12,27 +16,31 @@ def export_urls(limit=0, csv_file="output/urls.csv", simplify=False):

     df.to_csv(csv_file, index=False)
     print("--------------------")
-    print("Urls exported to", csv_file)
-
-
-
-
-
-
-
-
+    print("📊🖋️ Urls exported to", csv_file)
+    if "description" in df:
+        try:
+            df.replace(
+                {
+                    "description": {r"\n": " "},
+                },
+                regex=True,
+                inplace=True,
+            )
+        except:
+            pass
     df.to_html(csv_file + "-preview.html", index=False)
-    print("Urls preview exported to", csv_file + "-preview.html")
+    print("📜🖋️ Urls preview exported to", csv_file + "-preview.html")
     print("--------------------")


 def export_report(csv_file="output/report.csv"):
+    output_folder = config.get_dir("output")
     df = urls_manager.get_urls_report()

     df.to_csv(csv_file, index=False)
     _clear_file(csv_file)
     print("--------------------")
-    print("Urls report exported to", csv_file)
+    print("📊🖋️ Urls report exported to", csv_file)

     df.replace(
         {
@@ -44,9 +52,10 @@ def export_report(csv_file="output/report.csv"):
     df.to_html(csv_file + "-preview.html", index=False)
     _clear_file(csv_file + "-preview.html")

-    print("Urls report preview exported to", csv_file + "-preview.html")
+    print("📜🖋️ Urls report preview exported to", csv_file + "-preview.html")
     print("--------------------")

+
 # TODO: Add transformation layer
 def _clear_file(txt_tile):
     with open(txt_tile, "r") as f:
@@ -56,6 +65,7 @@ def _clear_file(txt_tile):
     with open(txt_tile, "w") as f:
         f.write(content)

+
 def show_urls(limit=0, jump_to_page=0):
     df = urls_manager.get_urls(limit=limit)
     df.drop(columns=["json", "description"], inplace=True)
@@ -100,8 +110,6 @@ def show_urls(limit=0, jump_to_page=0):

         return

-        return
-

 # TODO: Change place
 def show_table(df):