ohmyscrapper 0.2.1__py3-none-any.whl → 0.6.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,31 +1,99 @@
  import os
  from urlextract import URLExtract
  import ohmyscrapper.models.urls_manager as urls_manager
+ from ohmyscrapper.core import config
 
 
- def load_txt(file_name="input/_chat.txt"):
+ def _increment_file_name(text_file_content, file_name):
+     print(f"reading and loading file `{file_name}`... ")
+     with open(file_name, "r") as f:
+         return text_file_content + f.read()
 
-     if not os.path.exists("input"):
-         os.mkdir("input")
 
-     urls_manager.create_tables()
+ def load_txt(file_name="input", verbose=False):
+     input_folder = config.get_dir("input")
+     if not os.path.exists(input_folder):
+         os.mkdir(input_folder)
+
      urls_manager.seeds()
-     # make it recursive for all files
-     text_file_content = open(file_name, "r").read()
 
-     put_urls_from_string(text_to_process=text_file_content)
+     text_file_content = ""
+     if file_name is not None and not os.path.isdir(file_name):
+         print(f"📖 reading file `{file_name}`... ")
+         if not os.path.exists(file_name):
+             if file_name.startswith("https://") or file_name.startswith("http://"):
+                 text_file_content = " " + file_name + " "
+             else:
+                 print(f"\n file `{file_name}` not found.")
+                 return
+         else:
+             text_file_content = _increment_file_name(
+                 text_file_content=text_file_content, file_name=file_name
+             )
+     else:
+         input_folder = config.get_dir("input")
+         print(f"📂 reading {input_folder} directory... ")
+         if file_name is None:
+             dir_files = input_folder
+         else:
+             dir_files = file_name
+         text_files = os.listdir(dir_files)
+         for file in text_files:
+             if not file.endswith(".txt"):
+                 text_files.remove(file)
+         if len(text_files) == 0:
+             print(f"No text files found in {input_folder} directory!")
+             return
+         elif len(text_files) == 1:
+             print(f"📖 reading file `{dir_files}/{text_files[0]}`... ")
+             text_file_content = _increment_file_name(
+                 text_file_content=text_file_content,
+                 file_name=os.path.join(dir_files, text_files[0]),
+             )
+         else:
+             print("\nChoose a text file. Use `*` for process all and `q` to quit:")
+             for index, file in enumerate(text_files):
+                 print(f"[{index}]:", os.path.join(dir_files, file))
+
+             text_file_option = -1
+             while text_file_option < 0 or text_file_option >= len(text_files):
+                 text_file_option = input("Enter the file number: ")
+                 if text_file_option == "*":
+                     for file in text_files:
+                         text_file_content = _increment_file_name(
+                             text_file_content=text_file_content,
+                             file_name=os.path.join(dir_files, file),
+                         )
+                     text_file_option = 0
+                 elif text_file_option == "q":
+                     return
+                 elif text_file_option.isdigit():
+                     text_file_option = int(text_file_option)
+                     if text_file_option >= 0 and text_file_option < len(text_files):
+                         text_file_content = _increment_file_name(
+                             text_file_content=text_file_content,
+                             file_name=os.path.join(
+                                 dir_files, text_files[int(text_file_option)]
+                             ),
+                         )
+
+     print("🔎 looking for urls...")
+     urls_found = put_urls_from_string(
+         text_to_process=text_file_content, verbose=verbose
+     )
 
-     # move_it_to_processed
      print("--------------------")
-     print(file_name, "processed")
+     print("files processed")
+     print(f"📦 {urls_found} urls were extracted and packed into the database")
 
 
- def put_urls_from_string(text_to_process, parent_url=None):
+ def put_urls_from_string(text_to_process, parent_url=None, verbose=False):
      if isinstance(text_to_process, str):
          extractor = URLExtract()
          for url in extractor.find_urls(text_to_process):
              urls_manager.add_url(url=url, parent_url=parent_url)
-             print(url, "added")
+             if verbose:
+                 print(url, "added")
 
          return len(extractor.find_urls(text_to_process))
      else:
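
Note on the new `load_txt` above: the `.txt` filter removes entries from `text_files` while iterating over that same list, so when two non-`.txt` files sit next to each other the second one survives the filter. A minimal, order-safe sketch of the same filter (illustrative only, not code from the package):

    import os

    def list_text_files(dir_files):
        # build a new list instead of mutating the one being iterated,
        # which is what lets adjacent non-.txt entries slip through
        return [f for f in os.listdir(dir_files) if f.endswith(".txt")]
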
@@ -1,4 +1,5 @@
  import ohmyscrapper.models.urls_manager as urls_manager
+ from ohmyscrapper.core import config
  from bs4 import BeautifulSoup
  from google import genai
  from dotenv import load_dotenv
@@ -7,9 +8,11 @@ import time
  import os
  import yaml
  import json
+
  # TODO: !!! REFACTOR !!!
  load_dotenv()
 
+
  def reprocess_ai_history():
      df = urls_manager.get_ai_log().to_dict(orient="records")
      for row in df:
@@ -17,28 +20,34 @@ def reprocess_ai_history():
 
 
  def process_ai_response(response):
-     job_positions = xml2dict(response)
-
-     for index, xml_item_children in job_positions.items():
-         for url_child_xml in xml_item_children:
-
-             url_parent = urls_manager.get_url_by_id(url_child_xml["id"])
-             if len(url_parent) > 0:
-                 url_parent = url_parent.iloc[0]
-                 h1 = url_child_xml.copy()
-                 del h1["id"]
-                 del h1["url"]
-                 h1 = " - ".join(h1.values())
-                 if url_parent["description_links"] > 1 and url_child_xml["id"] != "":
-                     print("-- child updated -- \n", url_child_xml["url"] , ":", h1)
-                     urls_manager.set_url_h1(url_child_xml["url"], h1)
-                     urls_manager.set_url_ai_processed_by_url(url_child_xml["url"], str(json.dumps(url_child_xml)))
-                     if url_parent["url"] != url_child_xml["url"]:
-                         urls_manager.set_url_ai_processed_by_url(url_parent["url"], "children-update")
-                 else:
-                     print("-- parent updated -- \n", url_parent["url"], ":", h1)
-                     urls_manager.set_url_h1(url_parent["url"], h1)
-                     urls_manager.set_url_ai_processed_by_url(url_parent["url"], str(json.dumps(url_child_xml)))
+     job_positions = xml2dict(response)
+
+     for index, xml_item_children in job_positions.items():
+         for url_child_xml in xml_item_children:
+
+             url_parent = urls_manager.get_url_by_id(url_child_xml["id"])
+             if len(url_parent) > 0:
+                 url_parent = url_parent.iloc[0]
+                 h1 = url_child_xml.copy()
+                 del h1["id"]
+                 del h1["url"]
+                 h1 = " - ".join(h1.values())
+                 if url_parent["description_links"] > 1 and url_child_xml["id"] != "":
+                     print("-- child updated -- \n", url_child_xml["url"], ":", h1)
+                     urls_manager.set_url_h1(url_child_xml["url"], h1)
+                     urls_manager.set_url_ai_processed_by_url(
+                         url_child_xml["url"], str(json.dumps(url_child_xml))
+                     )
+                     if url_parent["url"] != url_child_xml["url"]:
+                         urls_manager.set_url_ai_processed_by_url(
+                             url_parent["url"], "children-update"
+                         )
+                 else:
+                     print("-- parent updated -- \n", url_parent["url"], ":", h1)
+                     urls_manager.set_url_h1(url_parent["url"], h1)
+                     urls_manager.set_url_ai_processed_by_url(
+                         url_parent["url"], str(json.dumps(url_child_xml))
+                     )
 
 
  def xml2dict(xml_string):
@@ -46,19 +55,21 @@ def xml2dict(xml_string):
 
      children_items_dict = {}
      for item in soup.find_all():
-         if(item.parent.name == "[document]"):
+         if item.parent.name == "[document]":
              children_items_dict[item.name] = []
          elif item.parent.name in children_items_dict:
              children_items_dict[item.parent.name].append(_xml_children_to_dict(item))
 
      return children_items_dict
 
+
  def _xml_children_to_dict(xml):
      item_dict = {}
      for item in xml.find_all():
          item_dict[item.name] = item.text
      return item_dict
 
+
  def process_with_ai(recursive=True, triggered_times=0):
      triggered_times = triggered_times + 1
 
@@ -91,13 +102,23 @@ def process_with_ai(recursive=True, triggered_times=0):
      print("prompt:", prompt["name"])
      print("model:", prompt["model"])
      print("description:", prompt["description"])
-     prompt["instructions"] = prompt["instructions"].replace("{ohmyscrapper_texts}", texts)
+     prompt["instructions"] = prompt["instructions"].replace(
+         "{ohmyscrapper_texts}", texts
+     )
 
      # The client gets the API key from the environment variable `GEMINI_API_KEY`.
      client = genai.Client()
-     response = client.models.generate_content(model=prompt["model"], contents=prompt["instructions"])
+     response = client.models.generate_content(
+         model=prompt["model"], contents=prompt["instructions"]
+     )
      response = str(response.text)
-     urls_manager.add_ai_log(instructions=prompt["instructions"], response=response, model=prompt["model"], prompt_name=prompt["name"], prompt_file=prompt["prompt_file"])
+     urls_manager.add_ai_log(
+         instructions=prompt["instructions"],
+         response=response,
+         model=prompt["model"],
+         prompt_name=prompt["name"],
+         prompt_file=prompt["prompt_file"],
+     )
      print(response)
      print("^^^^^^")
      process_ai_response(response=response)
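
On the response contract: `process_ai_response` hands the raw model text to `xml2dict`, which expects flat XML in which each top-level container's children carry an `id`, a `url`, and whatever label fields get joined into the `h1`. A hedged illustration of the shape being parsed (tag names invented for the example; the soup construction sits outside these hunks, so the parser choice is an assumption):

    from bs4 import BeautifulSoup

    sample = (
        "<jobs><item><id>3</id><url>https://example.com/j/3</url>"
        "<title>Data Engineer</title></item></jobs>"
    )
    soup = BeautifulSoup(sample, "html.parser")  # parser choice assumed
    # xml2dict maps top-level tags to lists of child dicts, so this yields:
    # {"jobs": [{"id": "3", "url": "https://example.com/j/3", "title": "Data Engineer"}]}
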
@@ -114,7 +135,9 @@ def process_with_ai(recursive=True, triggered_times=0):
      if triggered_times > 5:
          print("!!! This is a break to prevent budget accident$.")
          print("You triggered", triggered_times, "times the AI processing function.")
-         print("If you are sure this is correct, you can re-call this function again.")
+         print(
+             "If you are sure this is correct, you can re-call this function again."
+         )
          print("Please, check it.")
          return
 
@@ -122,8 +145,13 @@ def process_with_ai(recursive=True, triggered_times=0):
 
      return
 
+
  def _get_prompt():
-     prompts_path = "prompts"
+     prompts_path = config.get_dir(param="prompts")
+     default_prommpt_file = os.path.join(
+         prompts_path, config.get_ai("default_prompt_file")
+     )
+
      default_prompt = """---
  model: "gemini-2.5-flash"
  name: "default-prompt"
@@ -133,15 +161,18 @@ Process with AI this prompt: {ohmyscrapper_texts}
  """
      if not os.path.exists(prompts_path):
          os.mkdir(prompts_path)
-
-         open(f"{prompts_path}/prompt.md", "w").write(default_prompt)
-         print(f"You didn't have a prompt file. One was created in the /{prompts_path} folder. You can change it there.")
+         open(default_prommpt_file, "w").write(default_prompt)
+         print(
+             f"You didn't have a prompt file. One was created in the /{prompts_path} folder. You can change it there."
+         )
          return False
 
      prompt_files = os.listdir(prompts_path)
      if len(prompt_files) == 0:
-         open(f"{prompts_path}/prompt.md", "w").write(default_prompt)
-         print(f"You didn't have a prompt file. One was created in the /{prompts_path} folder. You can change it there.")
+         open(default_prommpt_file, "w").write(default_prompt)
+         print(
+             f"You didn't have a prompt file. One was created in the /{prompts_path} folder. You can change it there."
+         )
          return False
      prompt = {}
      if len(prompt_files) == 1:
@@ -151,8 +182,10 @@ Process with AI this prompt: {ohmyscrapper_texts}
          prompts = {}
          for index, file in enumerate(prompt_files):
              prompts[index] = _parse_prompt(prompts_path=prompts_path, prompt_file=file)
-             print(index, ":", prompts[index]['name'])
-         input_prompt = input("Type the number of the prompt you want to use or 'q' to quit: ")
+             print(index, ":", prompts[index]["name"])
+         input_prompt = input(
+             "Type the number of the prompt you want to use or 'q' to quit: "
+         )
          if input_prompt == "q":
              return False
          try:
@@ -162,14 +195,17 @@ Process with AI this prompt: {ohmyscrapper_texts}
              prompt = _get_prompt()
      return prompt
 
+
  def _parse_prompt(prompts_path, prompt_file):
      prompt = {}
-     raw_prompt = open(f"{prompts_path}/{prompt_file}", "r").read().split("---")
+     raw_prompt = open(os.path.join(prompts_path, prompt_file), "r").read().split("---")
      prompt = yaml.safe_load(raw_prompt[1])
      prompt["instructions"] = raw_prompt[2].strip()
      prompt["prompt_file"] = prompt_file
 
      return prompt
+
+
  # TODO: Separate gemini from basic function
  def _process_with_gemini(model, instructions):
      response = """"""
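
A note on the prompt handling consolidated above: `_parse_prompt` splits the file on `---`, YAML-loads the first delimited block, and takes the next block as the instruction body, which means a stray `---` inside the instructions would truncate them. A sketch of that parse against the default prompt's shape (the `prompt.md` name is the old default; the file name now comes from `config.get_ai("default_prompt_file")`):

    import os
    import yaml

    raw = open(os.path.join("prompts", "prompt.md"), "r").read().split("---")
    header = yaml.safe_load(raw[1])  # e.g. {"model": "gemini-2.5-flash", "name": "default-prompt", ...}
    instructions = raw[2].strip()    # body containing the {ohmyscrapper_texts} slot
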
@@ -7,72 +7,87 @@ import time
  import random
 
 
- def process_linkedin_redirect(url_report, url):
-     print("linkedin_redirect")
+ def process_linkedin_redirect(url_report, url, verbose=False):
+     if verbose:
+         print("linkedin_redirect")
 
      if url_report["total-a-links"] < 5:
          if "first-a-link" in url_report.keys():
              url_destiny = url_report["first-a-link"]
          else:
              urls_manager.set_url_error(url=url["url"], value="error: no first-a-link")
-             print("no url for:", url["url"])
+             if verbose:
+                 print("no url for:", url["url"])
              return
      else:
          if "og:url" in url_report.keys():
              url_destiny = url_report["og:url"]
          else:
              urls_manager.set_url_error(url=url["url"], value="error: no og:url")
-             print("no url for:", url["url"])
+             if verbose:
+                 print("no url for:", url["url"])
              return
-
-     print(url["url"], ">>", url_destiny)
+     if verbose:
+         print(url["url"], ">>", url_destiny)
      urls_manager.add_url(url=url_destiny)
      urls_manager.set_url_destiny(url=url["url"], destiny=url_destiny)
 
 
- def process_linkedin_feed(url_report, url):
-     print("linkedin_feed")
+ def process_linkedin_feed(url_report, url, verbose=False):
+     if verbose:
+         print("linkedin_feed")
 
      if "og:url" in url_report.keys():
          url_destiny = url_report["og:url"]
      else:
          urls_manager.set_url_error(url=url["url"], value="error: no og:url")
-         print("no url for:", url["url"])
+         if verbose:
+             print("no url for:", url["url"])
          return
 
-     print(url["url"], ">>", url_destiny)
+     if verbose:
+         print(url["url"], ">>", url_destiny)
      urls_manager.add_url(url=url_destiny)
      urls_manager.set_url_destiny(url=url["url"], destiny=url_destiny)
 
 
- def process_linkedin_job(url_report, url):
-     print("linkedin_job")
+ def process_linkedin_job(url_report, url, verbose=False):
+     if verbose:
+         print("linkedin_job")
      changed = False
      if "h1" in url_report.keys():
-         print(url["url"], ": ", url_report["h1"])
+         if verbose:
+             print(url["url"], ": ", url_report["h1"])
          urls_manager.set_url_h1(url=url["url"], value=url_report["h1"])
          changed = True
      elif "og:title" in url_report.keys():
-         print(url["url"], ": ", url_report["og:title"])
+         if verbose:
+             print(url["url"], ": ", url_report["og:title"])
          urls_manager.set_url_h1(url=url["url"], value=url_report["og:title"])
          changed = True
 
      if "description" in url_report.keys():
-         urls_manager.set_url_description(url=url["url"], value=url_report["description"])
+         urls_manager.set_url_description(
+             url=url["url"], value=url_report["description"]
+         )
          changed = True
      elif "og:description" in url_report.keys():
-         urls_manager.set_url_description(url=url["url"], value=url_report["og:description"])
+         urls_manager.set_url_description(
+             url=url["url"], value=url_report["og:description"]
+         )
          changed = True
      if not changed:
          urls_manager.set_url_error(url=url["url"], value="error: no h1 or description")
 
 
- def process_linkedin_post(url_report, url):
-     print("linkedin_post or generic")
-     print(url["url"])
+ def process_linkedin_post(url_report, url, verbose=False):
+     if verbose:
+         print("linkedin_post or generic")
+         print(url["url"])
      changed = False
      if "h1" in url_report.keys():
-         print(url["url"], ": ", url_report["h1"])
+         if verbose:
+             print(url["url"], ": ", url_report["h1"])
          urls_manager.set_url_h1(url=url["url"], value=url_report["h1"])
          changed = True
      elif "og:title" in url_report.keys():
@@ -88,52 +103,50 @@ def process_linkedin_post(url_report, url):
 
      if description is not None:
          urls_manager.set_url_description(url=url["url"], value=description)
-         description_links = load_txt.put_urls_from_string(text_to_process=description, parent_url=url["url"])
+         description_links = load_txt.put_urls_from_string(
+             text_to_process=description, parent_url=url["url"]
+         )
          urls_manager.set_url_description_links(url=url["url"], value=description_links)
 
      if not changed:
          urls_manager.set_url_error(url=url["url"], value="error: no h1 or description")
 
 
- def scrap_url(url):
-     # TODO: Use get_urls_valid_prefix_by_id()
-     df = urls_manager.get_urls_valid_prefix()
-
+ def scrap_url(url, verbose=False):
      # TODO: Need to change this
 
      if url["url_type"] is None:
-         print("\n\ngeneric:", url["url"])
+         if verbose:
+             print("\n\ngeneric:", url["url"])
          url["url_type"] = "generic"
      else:
-         print("\n\n", url["url_type"] + ":", url["url"])
+         if verbose:
+             print("\n\n", url["url_type"] + ":", url["url"])
      try:
          url_report = sniff_url.get_tags(url=url["url"])
      except Exception as e:
          urls_manager.set_url_error(url=url["url"], value="error")
          urls_manager.touch_url(url=url["url"])
-         print("\n\n!!! ERROR FOR:", url["url"])
-         print(
-             "\n\n!!! you can check the URL using the command sniff-url",
-             url["url"],
-             "\n\n",
-         )
+         if verbose:
+             print("\n\n!!! ERROR FOR:", url["url"])
+             print(
+                 "\n\n!!! you can check the URL using the command sniff-url",
+                 url["url"],
+                 "\n\n",
+             )
          return
 
-     # linkedin_redirect - linkedin (https://lnkd.in/)
      if url["url_type"] == "linkedin_redirect":
-         process_linkedin_redirect(url_report=url_report, url=url)
+         process_linkedin_redirect(url_report=url_report, url=url, verbose=verbose)
 
-     # linkedin_feed - linkedin (https://%.linkedin.com/feed/)
      if url["url_type"] == "linkedin_feed":
-         process_linkedin_feed(url_report=url_report, url=url)
+         process_linkedin_feed(url_report=url_report, url=url, verbose=verbose)
 
-     # linkedin_job - linkedin (https://www.linkedin.com/jobs/)
      if url["url_type"] == "linkedin_job":
-         process_linkedin_job(url_report=url_report, url=url)
+         process_linkedin_job(url_report=url_report, url=url, verbose=verbose)
 
-     # linkedin_job - linkedin (https://www.linkedin.com/jobs/)
      if url["url_type"] == "linkedin_post" or url["url_type"] == "generic":
-         process_linkedin_post(url_report=url_report, url=url)
+         process_linkedin_post(url_report=url_report, url=url, verbose=verbose)
 
      urls_manager.set_url_json(url=url["url"], value=url_report["json"])
      urls_manager.touch_url(url=url["url"])
@@ -144,35 +157,53 @@ def isNaN(num):
 
 
  def scrap_urls(
-     recursive=False, ignore_valid_prefix=False, randomize=False, only_parents=True
+     recursive=False,
+     ignore_valid_prefix=False,
+     randomize=False,
+     only_parents=True,
+     verbose=False,
+     n_urls=0,
  ):
+     limit = 10
      classify_urls.classify_urls()
      urls = urls_manager.get_untouched_urls(
          ignore_valid_prefix=ignore_valid_prefix,
          randomize=randomize,
          only_parents=only_parents,
+         limit=limit,
      )
      if len(urls) == 0:
-         print("no urls to scrap")
+         print("📭 no urls to scrap")
+         if n_urls > 0:
+             print(f"-- 🗃️ {n_urls} scraped urls in total...")
+         print("scrapping is over...")
          return
      for index, url in urls.iterrows():
-         scrap_url(url)
-
-         wait = random.randint(15, 20)
          wait = random.randint(1, 3)
-         print("sleeping for", wait, "seconds")
+         print(
+             "🐶 Scrapper is sleeping for", wait, "seconds before scraping next url..."
+         )
          time.sleep(wait)
 
+         print("🐕 Scrapper is sniffing the url...")
+         scrap_url(url=url, verbose=verbose)
+
+     n_urls = n_urls + len(urls)
+     print(f"-- 🗃️ {n_urls} scraped urls...")
      classify_urls.classify_urls()
      if recursive:
          wait = random.randint(5, 10)
-         print("sleeping for", wait, "seconds before next round")
+         print(
+             f"🐶 Scrapper is sleeping for {wait} seconds before next round of {limit} urls"
+         )
          time.sleep(wait)
          scrap_urls(
              recursive=recursive,
              ignore_valid_prefix=ignore_valid_prefix,
              randomize=randomize,
              only_parents=only_parents,
+             verbose=verbose,
+             n_urls=n_urls,
          )
      else:
-         print("ending...")
+         print("scrapping is over...")
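
The reworked `scrap_urls` now pulls work in batches of `limit = 10` and recurses once per batch, threading `n_urls` through as an accumulator, so a large backlog deepens the call stack by one frame per ten urls. An iterative equivalent for illustration (batch size and sleep ranges come from the diff; the function names here are placeholders):

    import random
    import time

    def scrap_all(fetch_batch, scrap_one, limit=10):
        n_urls = 0
        while True:
            urls = fetch_batch(limit)  # stands in for get_untouched_urls(..., limit=limit)
            if len(urls) == 0:
                return n_urls
            for url in urls:
                time.sleep(random.randint(1, 3))  # per-url politeness delay
                scrap_one(url)
            n_urls += len(urls)
            time.sleep(random.randint(5, 10))  # pause between rounds
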
@@ -1,7 +1,33 @@
  import ohmyscrapper.models.urls_manager as urls_manager
+ from ohmyscrapper.core import config
 
 
  def seed():
-     urls_manager.seeds()
-     print("db seeded")
+     if not config.url_types_file_exists():
+         db_url_types = urls_manager.get_urls_valid_prefix()
+         if len(db_url_types) > 0:
+             export_url_types_to_file()
+             print("🪹 you have a new `url_types.yaml` based on your db! =)")
+             return
+
+     seeds = get_url_types_from_file()
+
+     if len(seeds) > 0:
+         urls_manager.seeds(seeds=seeds)
+         print("🫒 db seeded")
      return
+
+
+ def get_url_types_from_file():
+     url_types_from_file = config.get_url_types()
+     if url_types_from_file is None:
+         url_types_from_file = {}
+     return url_types_from_file
+
+
+ def export_url_types_to_file():
+     url_types = urls_manager.get_urls_valid_prefix()
+     yaml_url_types = {}
+     for index, url_type in url_types.iterrows():
+         yaml_url_types[url_type["url_type"]] = url_type["url_prefix"]
+     config.append_url_types(yaml_url_types)
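
The seeding flow above bootstraps `url_types.yaml` from the database on first run and treats the file as the seed source afterwards. Going by `export_url_types_to_file`, the file is a flat `url_type` to `url_prefix` mapping; a hedged example of that shape, with prefixes taken from the comments removed in the scraper module earlier in this diff:

    import yaml

    # assumed on-disk shape of url_types.yaml
    url_types = {
        "linkedin_redirect": "https://lnkd.in/",
        "linkedin_feed": "https://%.linkedin.com/feed/",
        "linkedin_job": "https://www.linkedin.com/jobs/",
    }
    print(yaml.safe_dump(url_types))
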
@@ -1,10 +1,14 @@
  import ohmyscrapper.models.urls_manager as urls_manager
+ from ohmyscrapper.core import config
  import math
+ import os
  from rich.console import Console
  from rich.table import Table
 
 
  def export_urls(limit=0, csv_file="output/urls.csv", simplify=False):
+     output_folder = config.get_dir("output")
+
      df = urls_manager.get_urls(limit=limit)
 
      if simplify:
@@ -12,27 +16,31 @@ def export_urls(limit=0, csv_file="output/urls.csv", simplify=False):
 
      df.to_csv(csv_file, index=False)
      print("--------------------")
-     print("Urls exported to", csv_file)
-
-     df.replace(
-         {
-             "description": {r"\n": " "},
-         },
-         regex=True,
-         inplace=True,
-     )
+     print("📊🖋️ Urls exported to", csv_file)
+     if "description" in df:
+         try:
+             df.replace(
+                 {
+                     "description": {r"\n": " "},
+                 },
+                 regex=True,
+                 inplace=True,
+             )
+         except:
+             pass
      df.to_html(csv_file + "-preview.html", index=False)
-     print("Urls preview exported to", csv_file + "-preview.html")
+     print("📜🖋️ Urls preview exported to", csv_file + "-preview.html")
      print("--------------------")
 
 
  def export_report(csv_file="output/report.csv"):
+     output_folder = config.get_dir("output")
      df = urls_manager.get_urls_report()
 
      df.to_csv(csv_file, index=False)
      _clear_file(csv_file)
      print("--------------------")
-     print("Urls report exported to", csv_file)
+     print("📊🖋️ Urls report exported to", csv_file)
 
      df.replace(
          {
@@ -44,9 +52,10 @@ def export_report(csv_file="output/report.csv"):
      df.to_html(csv_file + "-preview.html", index=False)
      _clear_file(csv_file + "-preview.html")
 
-     print("Urls report preview exported to", csv_file + "-preview.html")
+     print("📜🖋️ Urls report preview exported to", csv_file + "-preview.html")
      print("--------------------")
 
+
  # TODO: Add transformation layer
  def _clear_file(txt_tile):
      with open(txt_tile, "r") as f:
@@ -56,6 +65,7 @@ def _clear_file(txt_tile):
      with open(txt_tile, "w") as f:
          f.write(content)
 
+
  def show_urls(limit=0, jump_to_page=0):
      df = urls_manager.get_urls(limit=limit)
      df.drop(columns=["json", "description"], inplace=True)
@@ -100,8 +110,6 @@ def show_urls(limit=0, jump_to_page=0):
 
      return
 
-     return
-
 
  # TODO: Change place
  def show_table(df):
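
One loose end in the export module: both `export_urls` and `export_report` now resolve `output_folder` via `config.get_dir("output")` but never use it, and their `csv_file` defaults still hard-code `output/...` paths. If config-driven output locations are the intent, the join would presumably look like this (hypothetical helper, not in the package; `config` is the package's own module):

    import os
    from ohmyscrapper.core import config

    def default_output_path(name="urls.csv"):
        # hypothetical: route exports through the configured output dir
        return os.path.join(config.get_dir("output"), name)
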