ohmyscrapper 0.2.3__py3-none-any.whl → 0.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ohmyscrapper/__init__.py +44 -22
- ohmyscrapper/core/config.py +107 -0
- ohmyscrapper/core/config_files.py +73 -0
- ohmyscrapper/core/default_files/config.yaml +16 -0
- ohmyscrapper/core/default_files/url_sniffing.yaml +25 -0
- ohmyscrapper/core/default_files/url_types.yaml +5 -0
- ohmyscrapper/models/urls_manager.py +95 -41
- ohmyscrapper/modules/classify_urls.py +14 -6
- ohmyscrapper/modules/load_txt.py +79 -11
- ohmyscrapper/modules/process_with_ai.py +72 -36
- ohmyscrapper/modules/scrap_urls.py +130 -121
- ohmyscrapper/modules/seed.py +28 -2
- ohmyscrapper/modules/show.py +22 -14
- ohmyscrapper/modules/sniff_url.py +112 -45
- ohmyscrapper/modules/untouch_all.py +1 -1
- {ohmyscrapper-0.2.3.dist-info → ohmyscrapper-0.7.0.dist-info}/METADATA +21 -15
- ohmyscrapper-0.7.0.dist-info/RECORD +21 -0
- ohmyscrapper-0.2.3.dist-info/RECORD +0 -16
- {ohmyscrapper-0.2.3.dist-info → ohmyscrapper-0.7.0.dist-info}/WHEEL +0 -0
- {ohmyscrapper-0.2.3.dist-info → ohmyscrapper-0.7.0.dist-info}/entry_points.txt +0 -0
ohmyscrapper/modules/process_with_ai.py
CHANGED

@@ -1,4 +1,5 @@
 import ohmyscrapper.models.urls_manager as urls_manager
+from ohmyscrapper.core import config
 from bs4 import BeautifulSoup
 from google import genai
 from dotenv import load_dotenv
@@ -7,9 +8,11 @@ import time
 import os
 import yaml
 import json
+
 # TODO: !!! REFACTOR !!!
 load_dotenv()

+
 def reprocess_ai_history():
     df = urls_manager.get_ai_log().to_dict(orient="records")
     for row in df:
@@ -17,28 +20,34 @@ def reprocess_ai_history():


 def process_ai_response(response):
- … (22 removed lines not shown in the diff source)
+    job_positions = xml2dict(response)
+
+    for index, xml_item_children in job_positions.items():
+        for url_child_xml in xml_item_children:
+
+            url_parent = urls_manager.get_url_by_id(url_child_xml["id"])
+            if len(url_parent) > 0:
+                url_parent = url_parent.iloc[0]
+                title = url_child_xml.copy()
+                del title["id"]
+                del title["url"]
+                title = " - ".join(title.values())
+                if url_parent["description_links"] > 1 and url_child_xml["id"] != "":
+                    print("-- child updated -- \n", url_child_xml["url"], ":", title)
+                    urls_manager.set_url_title(url_child_xml["url"], title)
+                    urls_manager.set_url_ai_processed_by_url(
+                        url_child_xml["url"], str(json.dumps(url_child_xml))
+                    )
+                    if url_parent["url"] != url_child_xml["url"]:
+                        urls_manager.set_url_ai_processed_by_url(
+                            url_parent["url"], "children-update"
+                        )
+                else:
+                    print("-- parent updated -- \n", url_parent["url"], ":", title)
+                    urls_manager.set_url_title(url_parent["url"], title)
+                    urls_manager.set_url_ai_processed_by_url(
+                        url_parent["url"], str(json.dumps(url_child_xml))
+                    )


 def xml2dict(xml_string):
@@ -46,19 +55,21 @@ def xml2dict(xml_string):

     children_items_dict = {}
     for item in soup.find_all():
-        if
+        if item.parent.name == "[document]":
             children_items_dict[item.name] = []
         elif item.parent.name in children_items_dict:
             children_items_dict[item.parent.name].append(_xml_children_to_dict(item))

     return children_items_dict

+
 def _xml_children_to_dict(xml):
     item_dict = {}
     for item in xml.find_all():
         item_dict[item.name] = item.text
     return item_dict

+
 def process_with_ai(recursive=True, triggered_times=0):
     triggered_times = triggered_times + 1

@@ -91,13 +102,23 @@ def process_with_ai(recursive=True, triggered_times=0):
     print("prompt:", prompt["name"])
     print("model:", prompt["model"])
     print("description:", prompt["description"])
-    prompt["instructions"] = prompt["instructions"].replace(
+    prompt["instructions"] = prompt["instructions"].replace(
+        "{ohmyscrapper_texts}", texts
+    )

     # The client gets the API key from the environment variable `GEMINI_API_KEY`.
     client = genai.Client()
-    response = client.models.generate_content(
+    response = client.models.generate_content(
+        model=prompt["model"], contents=prompt["instructions"]
+    )
     response = str(response.text)
-    urls_manager.add_ai_log(
+    urls_manager.add_ai_log(
+        instructions=prompt["instructions"],
+        response=response,
+        model=prompt["model"],
+        prompt_name=prompt["name"],
+        prompt_file=prompt["prompt_file"],
+    )
     print(response)
     print("^^^^^^")
     process_ai_response(response=response)
@@ -114,7 +135,9 @@ def process_with_ai(recursive=True, triggered_times=0):
     if triggered_times > 5:
         print("!!! This is a break to prevent budget accident$.")
         print("You triggered", triggered_times, "times the AI processing function.")
-        print(
+        print(
+            "If you are sure this is correct, you can re-call this function again."
+        )
         print("Please, check it.")
         return

@@ -122,8 +145,13 @@ def process_with_ai(recursive=True, triggered_times=0):

     return

+
 def _get_prompt():
-    prompts_path = "prompts"
+    prompts_path = config.get_dir(param="prompts")
+    default_prommpt_file = os.path.join(
+        prompts_path, config.get_ai("default_prompt_file")
+    )
+
     default_prompt = """---
 model: "gemini-2.5-flash"
 name: "default-prompt"
@@ -133,15 +161,18 @@ Process with AI this prompt: {ohmyscrapper_texts}
 """
     if not os.path.exists(prompts_path):
         os.mkdir(prompts_path)
-
-
-
+        open(default_prommpt_file, "w").write(default_prompt)
+        print(
+            f"You didn't have a prompt file. One was created in the /{prompts_path} folder. You can change it there."
+        )
         return False

     prompt_files = os.listdir(prompts_path)
     if len(prompt_files) == 0:
-        open(
-        print(
+        open(default_prommpt_file, "w").write(default_prompt)
+        print(
+            f"You didn't have a prompt file. One was created in the /{prompts_path} folder. You can change it there."
+        )
         return False
     prompt = {}
     if len(prompt_files) == 1:
@@ -151,8 +182,10 @@ Process with AI this prompt: {ohmyscrapper_texts}
         prompts = {}
         for index, file in enumerate(prompt_files):
             prompts[index] = _parse_prompt(prompts_path=prompts_path, prompt_file=file)
-            print(index, ":", prompts[index][
-        input_prompt = input(
+            print(index, ":", prompts[index]["name"])
+        input_prompt = input(
+            "Type the number of the prompt you want to use or 'q' to quit: "
+        )
         if input_prompt == "q":
             return False
         try:
@@ -162,14 +195,17 @@ Process with AI this prompt: {ohmyscrapper_texts}
         prompt = _get_prompt()
     return prompt

+
 def _parse_prompt(prompts_path, prompt_file):
     prompt = {}
-    raw_prompt = open(
+    raw_prompt = open(os.path.join(prompts_path, prompt_file), "r").read().split("---")
     prompt = yaml.safe_load(raw_prompt[1])
     prompt["instructions"] = raw_prompt[2].strip()
     prompt["prompt_file"] = prompt_file

     return prompt
+
+
 # TODO: Separate gemini from basic function
 def _process_with_gemini(model, instructions):
     response = """"""
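For reference, the new _parse_prompt above reads a prompt file as a YAML front-matter block between "---" markers (model, name, description) followed by the instruction text, with "{ohmyscrapper_texts}" as the placeholder for the scraped texts. A minimal sketch of that parsing, assuming a made-up description value and placeholder text (only the model, name, and placeholder come from the diff):

import yaml

# Hypothetical prompt file contents mirroring the default_prompt template above;
# the description string and the substituted text are invented for illustration.
raw_prompt = """---
model: "gemini-2.5-flash"
name: "default-prompt"
description: "example prompt (illustrative only)"
---
Process with AI this prompt: {ohmyscrapper_texts}
""".split("---")

prompt = yaml.safe_load(raw_prompt[1])           # {'model': ..., 'name': ..., 'description': ...}
prompt["instructions"] = raw_prompt[2].strip()   # body text containing the placeholder
prompt["instructions"] = prompt["instructions"].replace("{ohmyscrapper_texts}", "scraped text goes here")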
ohmyscrapper/modules/scrap_urls.py
CHANGED

@@ -2,141 +2,132 @@ import ohmyscrapper.models.urls_manager as urls_manager
 import ohmyscrapper.modules.sniff_url as sniff_url
 import ohmyscrapper.modules.load_txt as load_txt
 import ohmyscrapper.modules.classify_urls as classify_urls
+from ohmyscrapper.core import config

 import time
 import random


-def
-
-
-    if url_report["total-a-links"] < 5:
-        if "first-a-link" in url_report.keys():
-            url_destiny = url_report["first-a-link"]
-        else:
-            urls_manager.set_url_error(url=url["url"], value="error: no first-a-link")
-            print("no url for:", url["url"])
-            return
-    else:
-        if "og:url" in url_report.keys():
-            url_destiny = url_report["og:url"]
-        else:
-            urls_manager.set_url_error(url=url["url"], value="error: no og:url")
-            print("no url for:", url["url"])
-            return
+def scrap_url(url, verbose=False):
+    if url["url_type"] is None:
+        url["url_type"] = "generic"

-
-
-    urls_manager.set_url_destiny(url=url["url"], destiny=url_destiny)
+    if verbose:
+        print("\n\n", url["url_type"] + ":", url["url"])

+    try:
+        url_type = url["url_type"]
+        sniffing_config = config.get_url_sniffing()
+
+        if url_type not in sniffing_config:
+            default_type_sniffing = {
+                "bodytags": [{"h1": "title"}],
+                "metatags": [
+                    {"og:title": "title"},
+                    {"og:description": "description"},
+                    {"description": "description"},
+                ],
+            }
+            config.append_url_sniffing({url_type: default_type_sniffing})
+            sniffing_config = config.get_url_sniffing()
+
+        url_report = sniff_url.get_tags(
+            url=url["url"], sniffing_config=sniffing_config[url_type]
+        )
+    except Exception as e:
+        urls_manager.set_url_error(url=url["url"], value="error on scrapping")
+        urls_manager.touch_url(url=url["url"])
+        if verbose:
+            print("\n\n!!! ERROR FOR:", url["url"])
+            print(
+                "\n\n!!! you can check the URL using the command sniff-url",
+                url["url"],
+                "\n\n",
+            )
+        return

-
-
+    process_sniffed_url(
+        url_report=url_report,
+        url=url,
+        sniffing_config=sniffing_config[url_type],
+        verbose=verbose,
+    )

-
-
-    else:
-        urls_manager.set_url_error(url=url["url"], value="error: no og:url")
-        print("no url for:", url["url"])
-        return
+    urls_manager.set_url_json(url=url["url"], value=url_report["json"])
+    urls_manager.touch_url(url=url["url"])

-
-    urls_manager.add_url(url=url_destiny)
-    urls_manager.set_url_destiny(url=url["url"], destiny=url_destiny)
+    return


-def
-
+def process_sniffed_url(url_report, url, sniffing_config, verbose=False):
+    if verbose:
+        print(url["url_type"])
+        print(url["url"])
     changed = False
-    if "h1" in url_report.keys():
-        print(url["url"], ": ", url_report["h1"])
-        urls_manager.set_url_h1(url=url["url"], value=url_report["h1"])
-        changed = True
-    elif "og:title" in url_report.keys():
-        print(url["url"], ": ", url_report["og:title"])
-        urls_manager.set_url_h1(url=url["url"], value=url_report["og:title"])
-        changed = True

-
-
-
-
-
+    db_fields = {}
+    db_fields["title"] = None
+    db_fields["description"] = None
+    db_fields["url_destiny"] = None
+
+    if "metatags" in sniffing_config.keys():
+        for tag, bd_field in sniffing_config["metatags"].items():
+            if tag in url_report.keys():
+                if bd_field[:1] == "+":
+                    if db_fields[bd_field[1:]] is None:
+                        db_fields[bd_field[1:]] = ""
+                    db_fields[bd_field[1:]] = (
+                        db_fields[bd_field[1:]] + " " + url_report[tag]
+                    )
+                else:
+                    db_fields[bd_field] = url_report[tag]
+
+    if "bodytags" in sniffing_config.keys():
+        for tag, bd_field in sniffing_config["bodytags"].items():
+            if tag in url_report.keys():
+                if bd_field[:1] == "+":
+                    if db_fields[bd_field[1:]] is None:
+                        db_fields[bd_field[1:]] = ""
+                    db_fields[bd_field[1:]] = (
+                        db_fields[bd_field[1:]] + " " + url_report[tag]
+                    )
+                else:
+                    db_fields[bd_field] = url_report[tag]
+
+    if (
+        "atags" in sniffing_config.keys()
+        and "first-tag-as-url_destiny" in sniffing_config["atags"].keys()
+    ):
+        if (
+            url_report["total-a-links"]
+            < sniffing_config["atags"]["first-tag-as-url_destiny"]
+        ):
+            if "first-a-link" in url_report.keys():
+                db_fields["url_destiny"] = url_report["first-a-link"]
+
+    if db_fields["title"] is not None:
+        urls_manager.set_url_title(url=url["url"], value=db_fields["title"])
         changed = True
-    if not changed:
-        urls_manager.set_url_error(url=url["url"], value="error: no h1 or description")

+    if db_fields["description"] is not None:
+        urls_manager.set_url_description(url=url["url"], value=db_fields["description"])
+        description_links = load_txt.put_urls_from_string(
+            text_to_process=db_fields["description"], parent_url=url["url"]
+        )
+        urls_manager.set_url_description_links(url=url["url"], value=description_links)

-def process_linkedin_post(url_report, url):
-    print("linkedin_post or generic")
-    print(url["url"])
-    changed = False
-    if "h1" in url_report.keys():
-        print(url["url"], ": ", url_report["h1"])
-        urls_manager.set_url_h1(url=url["url"], value=url_report["h1"])
-        changed = True
-    elif "og:title" in url_report.keys():
-        urls_manager.set_url_h1(url=url["url"], value=url_report["og:title"])
-        changed = True
-    description = None
-    if "description" in url_report.keys():
-        description = url_report["description"]
-        changed = True
-    elif "og:description" in url_report.keys():
-        description = url_report["og:description"]
         changed = True

-    if
-    urls_manager.
-
-
+    if db_fields["url_destiny"] is not None:
+        urls_manager.add_url(url=db_fields["url_destiny"])
+        urls_manager.set_url_destiny(url=url["url"], destiny=db_fields["url_destiny"])
+        changed = True

     if not changed:
-        urls_manager.set_url_error(
-
-
-def scrap_url(url):
-    # TODO: Use get_urls_valid_prefix_by_id()
-    df = urls_manager.get_urls_valid_prefix()
-
-    # TODO: Need to change this
-
-    if url["url_type"] is None:
-        print("\n\ngeneric:", url["url"])
-        url["url_type"] = "generic"
-    else:
-        print("\n\n", url["url_type"] + ":", url["url"])
-    try:
-        url_report = sniff_url.get_tags(url=url["url"])
-    except Exception as e:
-        urls_manager.set_url_error(url=url["url"], value="error")
-        urls_manager.touch_url(url=url["url"])
-        print("\n\n!!! ERROR FOR:", url["url"])
-        print(
-            "\n\n!!! you can check the URL using the command sniff-url",
-            url["url"],
-            "\n\n",
+        urls_manager.set_url_error(
+            url=url["url"],
+            value="error: no title, url_destiny or description was founded",
         )
-        return
-
-    # linkedin_redirect - linkedin (https://lnkd.in/)
-    if url["url_type"] == "linkedin_redirect":
-        process_linkedin_redirect(url_report=url_report, url=url)
-
-    # linkedin_feed - linkedin (https://%.linkedin.com/feed/)
-    if url["url_type"] == "linkedin_feed":
-        process_linkedin_feed(url_report=url_report, url=url)
-
-    # linkedin_job - linkedin (https://www.linkedin.com/jobs/)
-    if url["url_type"] == "linkedin_job":
-        process_linkedin_job(url_report=url_report, url=url)
-
-    # linkedin_job - linkedin (https://www.linkedin.com/jobs/)
-    if url["url_type"] == "linkedin_post" or url["url_type"] == "generic":
-        process_linkedin_post(url_report=url_report, url=url)
-
-    urls_manager.set_url_json(url=url["url"], value=url_report["json"])
-    urls_manager.touch_url(url=url["url"])


 def isNaN(num):
@@ -144,35 +135,53 @@ def isNaN(num):


 def scrap_urls(
-    recursive=False,
+    recursive=False,
+    ignore_valid_prefix=False,
+    randomize=False,
+    only_parents=True,
+    verbose=False,
+    n_urls=0,
 ):
+    limit = 10
     classify_urls.classify_urls()
     urls = urls_manager.get_untouched_urls(
         ignore_valid_prefix=ignore_valid_prefix,
         randomize=randomize,
         only_parents=only_parents,
+        limit=limit,
     )
     if len(urls) == 0:
-        print("no urls to scrap")
+        print("📭 no urls to scrap")
+        if n_urls > 0:
+            print(f"-- 🗃️ {n_urls} scraped urls in total...")
+        print("scrapping is over...")
         return
     for index, url in urls.iterrows():
-        scrap_url(url)
-
-        wait = random.randint(15, 20)
         wait = random.randint(1, 3)
-        print(
+        print(
+            "🐶 Scrapper is sleeping for", wait, "seconds before scraping next url..."
+        )
         time.sleep(wait)

+        print("🐕 Scrapper is sniffing the url...")
+        scrap_url(url=url, verbose=verbose)
+
+    n_urls = n_urls + len(urls)
+    print(f"-- 🗃️ {n_urls} scraped urls...")
     classify_urls.classify_urls()
     if recursive:
         wait = random.randint(5, 10)
-        print(
+        print(
+            f"🐶 Scrapper is sleeping for {wait} seconds before next round of {limit} urls"
+        )
         time.sleep(wait)
         scrap_urls(
             recursive=recursive,
             ignore_valid_prefix=ignore_valid_prefix,
             randomize=randomize,
             only_parents=only_parents,
+            verbose=verbose,
+            n_urls=n_urls,
         )
     else:
-        print("
+        print("scrapping is over...")
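For reference, the rewritten scrap_url/process_sniffed_url above drive scraping from a per-url_type sniffing config (config.get_url_sniffing()) that maps meta/body tags in the sniffed report to db fields, where a leading "+" on the field name means append instead of replace. Below is a minimal sketch of that mapping logic under assumptions: the entries are shown as plain dicts (the default written by scrap_url uses lists of single-key dicts, while process_sniffed_url iterates with .items()), and the url_report values and the atags threshold are invented examples.

# Illustrative only - not part of the package; shape assumed from the diff above.
sniffing_entry = {
    "metatags": {"og:title": "title", "og:description": "+description", "description": "+description"},
    "bodytags": {"h1": "title"},
    "atags": {"first-tag-as-url_destiny": 5},  # assumed threshold compared with total-a-links
}
url_report = {  # invented example of what sniff_url.get_tags might report
    "og:title": "Some job title",
    "og:description": "Short summary.",
    "h1": "Some job title",
    "total-a-links": 3,
    "first-a-link": "https://example.com/job/123",
}

db_fields = {"title": None, "description": None, "url_destiny": None}
for section in ("metatags", "bodytags"):
    for tag, bd_field in sniffing_entry[section].items():
        if tag in url_report:
            if bd_field[:1] == "+":  # "+" prefix: concatenate into the target field
                db_fields[bd_field[1:]] = (db_fields[bd_field[1:]] or "") + " " + url_report[tag]
            else:                    # otherwise: overwrite the target field
                db_fields[bd_field] = url_report[tag]

if url_report["total-a-links"] < sniffing_entry["atags"]["first-tag-as-url_destiny"]:
    db_fields["url_destiny"] = url_report["first-a-link"]

print(db_fields)  # {'title': 'Some job title', 'description': ' Short summary.', 'url_destiny': 'https://example.com/job/123'}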
ohmyscrapper/modules/seed.py
CHANGED
@@ -1,7 +1,33 @@
 import ohmyscrapper.models.urls_manager as urls_manager
+from ohmyscrapper.core import config


 def seed():
-
-
+    if not config.url_types_file_exists():
+        db_url_types = urls_manager.get_urls_valid_prefix()
+        if len(db_url_types) > 0:
+            export_url_types_to_file()
+            print("🪹 you have a new `url_types.yaml` based on your db! =)")
+            return
+
+    seeds = get_url_types_from_file()
+
+    if len(seeds) > 0:
+        urls_manager.seeds(seeds=seeds)
+        print("🫒 db seeded")
     return
+
+
+def get_url_types_from_file():
+    url_types_from_file = config.get_url_types()
+    if url_types_from_file is None:
+        url_types_from_file = {}
+    return url_types_from_file
+
+
+def export_url_types_to_file():
+    url_types = urls_manager.get_urls_valid_prefix()
+    yaml_url_types = {}
+    for index, url_type in url_types.iterrows():
+        yaml_url_types[url_type["url_type"]] = url_type["url_prefix"]
+    config.append_url_types(yaml_url_types)
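For reference, export_url_types_to_file above turns the db's valid-prefix rows into a url_type to url_prefix mapping and appends it to url_types.yaml via config.append_url_types. A minimal sketch of that mapping's shape; the two entries are guesses inferred from the old scrap_urls.py comments (lnkd.in redirects, linkedin.com job pages), not the actual file contents.

# Illustrative only - assumed example entries, not real data from the package.
yaml_url_types = {
    "linkedin_redirect": "https://lnkd.in/",
    "linkedin_job": "https://www.linkedin.com/jobs/",
}
# Passed to config.append_url_types(yaml_url_types) as in the diff above.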
ohmyscrapper/modules/show.py
CHANGED
@@ -1,10 +1,14 @@
 import ohmyscrapper.models.urls_manager as urls_manager
+from ohmyscrapper.core import config
 import math
+import os
 from rich.console import Console
 from rich.table import Table


 def export_urls(limit=0, csv_file="output/urls.csv", simplify=False):
+    output_folder = config.get_dir("output")
+
     df = urls_manager.get_urls(limit=limit)

     if simplify:
@@ -12,27 +16,31 @@ def export_urls(limit=0, csv_file="output/urls.csv", simplify=False):

     df.to_csv(csv_file, index=False)
     print("--------------------")
-    print("Urls exported to", csv_file)
-
-
-
-
-
-
-
-
+    print("📊🖋️ Urls exported to", csv_file)
+    if "description" in df:
+        try:
+            df.replace(
+                {
+                    "description": {r"\n": " "},
+                },
+                regex=True,
+                inplace=True,
+            )
+        except:
+            pass
     df.to_html(csv_file + "-preview.html", index=False)
-    print("Urls preview exported to", csv_file + "-preview.html")
+    print("📜🖋️ Urls preview exported to", csv_file + "-preview.html")
     print("--------------------")


 def export_report(csv_file="output/report.csv"):
+    output_folder = config.get_dir("output")
     df = urls_manager.get_urls_report()

     df.to_csv(csv_file, index=False)
     _clear_file(csv_file)
     print("--------------------")
-    print("Urls report exported to", csv_file)
+    print("📊🖋️ Urls report exported to", csv_file)

     df.replace(
         {
@@ -44,9 +52,10 @@ def export_report(csv_file="output/report.csv"):
     df.to_html(csv_file + "-preview.html", index=False)
     _clear_file(csv_file + "-preview.html")

-    print("Urls report preview exported to", csv_file + "-preview.html")
+    print("📜🖋️ Urls report preview exported to", csv_file + "-preview.html")
     print("--------------------")

+
 # TODO: Add transformation layer
 def _clear_file(txt_tile):
     with open(txt_tile, "r") as f:
@@ -56,6 +65,7 @@ def _clear_file(txt_tile):
     with open(txt_tile, "w") as f:
         f.write(content)

+
 def show_urls(limit=0, jump_to_page=0):
     df = urls_manager.get_urls(limit=limit)
     df.drop(columns=["json", "description"], inplace=True)
@@ -100,8 +110,6 @@ def show_urls(limit=0, jump_to_page=0):

     return

-    return
-

 # TODO: Change place
 def show_table(df):