ohmyscrapper-0.2.3-py3-none-any.whl → ohmyscrapper-0.7.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,4 +1,5 @@
 import ohmyscrapper.models.urls_manager as urls_manager
+from ohmyscrapper.core import config
 from bs4 import BeautifulSoup
 from google import genai
 from dotenv import load_dotenv
@@ -7,9 +8,11 @@ import time
 import os
 import yaml
 import json
+
 # TODO: !!! REFACTOR !!!
 load_dotenv()
 
+
 def reprocess_ai_history():
     df = urls_manager.get_ai_log().to_dict(orient="records")
     for row in df:
@@ -17,28 +20,34 @@ def reprocess_ai_history():
 
 
 def process_ai_response(response):
-    job_positions = xml2dict(response)
-
-    for index, xml_item_children in job_positions.items():
-        for url_child_xml in xml_item_children:
-
-            url_parent = urls_manager.get_url_by_id(url_child_xml["id"])
-            if len(url_parent) > 0:
-                url_parent = url_parent.iloc[0]
-                h1 = url_child_xml.copy()
-                del h1["id"]
-                del h1["url"]
-                h1 = " - ".join(h1.values())
-                if url_parent["description_links"] > 1 and url_child_xml["id"] != "":
-                    print("-- child updated -- \n", url_child_xml["url"] , ":", h1)
-                    urls_manager.set_url_h1(url_child_xml["url"], h1)
-                    urls_manager.set_url_ai_processed_by_url(url_child_xml["url"], str(json.dumps(url_child_xml)))
-                    if url_parent["url"] != url_child_xml["url"]:
-                        urls_manager.set_url_ai_processed_by_url(url_parent["url"], "children-update")
-                else:
-                    print("-- parent updated -- \n", url_parent["url"], ":", h1)
-                    urls_manager.set_url_h1(url_parent["url"], h1)
-                    urls_manager.set_url_ai_processed_by_url(url_parent["url"], str(json.dumps(url_child_xml)))
+    job_positions = xml2dict(response)
+
+    for index, xml_item_children in job_positions.items():
+        for url_child_xml in xml_item_children:
+
+            url_parent = urls_manager.get_url_by_id(url_child_xml["id"])
+            if len(url_parent) > 0:
+                url_parent = url_parent.iloc[0]
+                title = url_child_xml.copy()
+                del title["id"]
+                del title["url"]
+                title = " - ".join(title.values())
+                if url_parent["description_links"] > 1 and url_child_xml["id"] != "":
+                    print("-- child updated -- \n", url_child_xml["url"], ":", title)
+                    urls_manager.set_url_title(url_child_xml["url"], title)
+                    urls_manager.set_url_ai_processed_by_url(
+                        url_child_xml["url"], str(json.dumps(url_child_xml))
+                    )
+                    if url_parent["url"] != url_child_xml["url"]:
+                        urls_manager.set_url_ai_processed_by_url(
+                            url_parent["url"], "children-update"
+                        )
+                else:
+                    print("-- parent updated -- \n", url_parent["url"], ":", title)
+                    urls_manager.set_url_title(url_parent["url"], title)
+                    urls_manager.set_url_ai_processed_by_url(
+                        url_parent["url"], str(json.dumps(url_child_xml))
+                    )
 
 
 def xml2dict(xml_string):
@@ -46,19 +55,21 @@ def xml2dict(xml_string):
 
     children_items_dict = {}
     for item in soup.find_all():
-        if(item.parent.name == "[document]"):
+        if item.parent.name == "[document]":
             children_items_dict[item.name] = []
         elif item.parent.name in children_items_dict:
             children_items_dict[item.parent.name].append(_xml_children_to_dict(item))
 
     return children_items_dict
 
+
 def _xml_children_to_dict(xml):
     item_dict = {}
     for item in xml.find_all():
         item_dict[item.name] = item.text
     return item_dict
 
+
 def process_with_ai(recursive=True, triggered_times=0):
     triggered_times = triggered_times + 1
 
@@ -91,13 +102,23 @@ def process_with_ai(recursive=True, triggered_times=0):
     print("prompt:", prompt["name"])
     print("model:", prompt["model"])
     print("description:", prompt["description"])
-    prompt["instructions"] = prompt["instructions"].replace("{ohmyscrapper_texts}", texts)
+    prompt["instructions"] = prompt["instructions"].replace(
+        "{ohmyscrapper_texts}", texts
+    )
 
     # The client gets the API key from the environment variable `GEMINI_API_KEY`.
     client = genai.Client()
-    response = client.models.generate_content(model=prompt["model"], contents=prompt["instructions"])
+    response = client.models.generate_content(
+        model=prompt["model"], contents=prompt["instructions"]
+    )
     response = str(response.text)
-    urls_manager.add_ai_log(instructions=prompt["instructions"], response=response, model=prompt["model"], prompt_name=prompt["name"], prompt_file=prompt["prompt_file"])
+    urls_manager.add_ai_log(
+        instructions=prompt["instructions"],
+        response=response,
+        model=prompt["model"],
+        prompt_name=prompt["name"],
+        prompt_file=prompt["prompt_file"],
+    )
     print(response)
     print("^^^^^^")
     process_ai_response(response=response)
@@ -114,7 +135,9 @@ def process_with_ai(recursive=True, triggered_times=0):
     if triggered_times > 5:
         print("!!! This is a break to prevent budget accident$.")
         print("You triggered", triggered_times, "times the AI processing function.")
-        print("If you are sure this is correct, you can re-call this function again.")
+        print(
+            "If you are sure this is correct, you can re-call this function again."
+        )
         print("Please, check it.")
         return
 
@@ -122,8 +145,13 @@ def process_with_ai(recursive=True, triggered_times=0):
 
     return
 
+
 def _get_prompt():
-    prompts_path = "prompts"
+    prompts_path = config.get_dir(param="prompts")
+    default_prommpt_file = os.path.join(
+        prompts_path, config.get_ai("default_prompt_file")
+    )
+
     default_prompt = """---
 model: "gemini-2.5-flash"
 name: "default-prompt"
@@ -133,15 +161,18 @@ Process with AI this prompt: {ohmyscrapper_texts}
 """
     if not os.path.exists(prompts_path):
         os.mkdir(prompts_path)
-
-        open(f"{prompts_path}/prompt.md", "w").write(default_prompt)
-        print(f"You didn't have a prompt file. One was created in the /{prompts_path} folder. You can change it there.")
+        open(default_prommpt_file, "w").write(default_prompt)
+        print(
+            f"You didn't have a prompt file. One was created in the /{prompts_path} folder. You can change it there."
+        )
         return False
 
     prompt_files = os.listdir(prompts_path)
     if len(prompt_files) == 0:
-        open(f"{prompts_path}/prompt.md", "w").write(default_prompt)
-        print(f"You didn't have a prompt file. One was created in the /{prompts_path} folder. You can change it there.")
+        open(default_prommpt_file, "w").write(default_prompt)
+        print(
+            f"You didn't have a prompt file. One was created in the /{prompts_path} folder. You can change it there."
+        )
         return False
     prompt = {}
     if len(prompt_files) == 1:
@@ -151,8 +182,10 @@ Process with AI this prompt: {ohmyscrapper_texts}
         prompts = {}
         for index, file in enumerate(prompt_files):
            prompts[index] = _parse_prompt(prompts_path=prompts_path, prompt_file=file)
-            print(index, ":", prompts[index]['name'])
-        input_prompt = input("Type the number of the prompt you want to use or 'q' to quit: ")
+            print(index, ":", prompts[index]["name"])
+        input_prompt = input(
+            "Type the number of the prompt you want to use or 'q' to quit: "
+        )
         if input_prompt == "q":
             return False
         try:
@@ -162,14 +195,17 @@ Process with AI this prompt: {ohmyscrapper_texts}
             prompt = _get_prompt()
     return prompt
 
+
 def _parse_prompt(prompts_path, prompt_file):
     prompt = {}
-    raw_prompt = open(f"{prompts_path}/{prompt_file}", "r").read().split("---")
+    raw_prompt = open(os.path.join(prompts_path, prompt_file), "r").read().split("---")
     prompt = yaml.safe_load(raw_prompt[1])
     prompt["instructions"] = raw_prompt[2].strip()
     prompt["prompt_file"] = prompt_file
 
     return prompt
+
+
 # TODO: Separate gemini from basic function
 def _process_with_gemini(model, instructions):
     response = """"""
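For context on the prompt handling above: `_parse_prompt` splits a prompt file on `---` and reads the first block as YAML front matter (`model`, `name`, and a `description` that `process_with_ai` prints), with the text after the second `---` used as the instructions. A minimal sketch of such a file and of the same parsing step; the description value and instruction text here are illustrative, not the package defaults:

import yaml

# Hypothetical prompt file content, mirroring the default_prompt template in the diff;
# the description and instructions shown here are examples only.
sample_prompt = """---
model: "gemini-2.5-flash"
name: "default-prompt"
description: "example prompt used for illustration"
---
Process with AI this prompt: {ohmyscrapper_texts}
"""

# Same split-on-"---" parsing that _parse_prompt performs.
raw_prompt = sample_prompt.split("---")
prompt = yaml.safe_load(raw_prompt[1])          # {"model": ..., "name": ..., "description": ...}
prompt["instructions"] = raw_prompt[2].strip()  # "Process with AI this prompt: {ohmyscrapper_texts}"
print(prompt["name"], "->", prompt["model"])    # default-prompt -> gemini-2.5-flash
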
@@ -2,141 +2,132 @@ import ohmyscrapper.models.urls_manager as urls_manager
 import ohmyscrapper.modules.sniff_url as sniff_url
 import ohmyscrapper.modules.load_txt as load_txt
 import ohmyscrapper.modules.classify_urls as classify_urls
+from ohmyscrapper.core import config
 
 import time
 import random
 
 
-def process_linkedin_redirect(url_report, url):
-    print("linkedin_redirect")
-
-    if url_report["total-a-links"] < 5:
-        if "first-a-link" in url_report.keys():
-            url_destiny = url_report["first-a-link"]
-        else:
-            urls_manager.set_url_error(url=url["url"], value="error: no first-a-link")
-            print("no url for:", url["url"])
-            return
-    else:
-        if "og:url" in url_report.keys():
-            url_destiny = url_report["og:url"]
-        else:
-            urls_manager.set_url_error(url=url["url"], value="error: no og:url")
-            print("no url for:", url["url"])
-            return
+def scrap_url(url, verbose=False):
+    if url["url_type"] is None:
+        url["url_type"] = "generic"
 
-    print(url["url"], ">>", url_destiny)
-    urls_manager.add_url(url=url_destiny)
-    urls_manager.set_url_destiny(url=url["url"], destiny=url_destiny)
+    if verbose:
+        print("\n\n", url["url_type"] + ":", url["url"])
 
+    try:
+        url_type = url["url_type"]
+        sniffing_config = config.get_url_sniffing()
+
+        if url_type not in sniffing_config:
+            default_type_sniffing = {
+                "bodytags": [{"h1": "title"}],
+                "metatags": [
+                    {"og:title": "title"},
+                    {"og:description": "description"},
+                    {"description": "description"},
+                ],
+            }
+            config.append_url_sniffing({url_type: default_type_sniffing})
+            sniffing_config = config.get_url_sniffing()
+
+        url_report = sniff_url.get_tags(
+            url=url["url"], sniffing_config=sniffing_config[url_type]
+        )
+    except Exception as e:
+        urls_manager.set_url_error(url=url["url"], value="error on scrapping")
+        urls_manager.touch_url(url=url["url"])
+        if verbose:
+            print("\n\n!!! ERROR FOR:", url["url"])
+            print(
+                "\n\n!!! you can check the URL using the command sniff-url",
+                url["url"],
+                "\n\n",
+            )
+        return
 
-def process_linkedin_feed(url_report, url):
-    print("linkedin_feed")
+    process_sniffed_url(
+        url_report=url_report,
+        url=url,
+        sniffing_config=sniffing_config[url_type],
+        verbose=verbose,
+    )
 
-    if "og:url" in url_report.keys():
-        url_destiny = url_report["og:url"]
-    else:
-        urls_manager.set_url_error(url=url["url"], value="error: no og:url")
-        print("no url for:", url["url"])
-        return
+    urls_manager.set_url_json(url=url["url"], value=url_report["json"])
+    urls_manager.touch_url(url=url["url"])
 
-    print(url["url"], ">>", url_destiny)
-    urls_manager.add_url(url=url_destiny)
-    urls_manager.set_url_destiny(url=url["url"], destiny=url_destiny)
+    return
 
 
-def process_linkedin_job(url_report, url):
-    print("linkedin_job")
+def process_sniffed_url(url_report, url, sniffing_config, verbose=False):
+    if verbose:
+        print(url["url_type"])
+        print(url["url"])
     changed = False
-    if "h1" in url_report.keys():
-        print(url["url"], ": ", url_report["h1"])
-        urls_manager.set_url_h1(url=url["url"], value=url_report["h1"])
-        changed = True
-    elif "og:title" in url_report.keys():
-        print(url["url"], ": ", url_report["og:title"])
-        urls_manager.set_url_h1(url=url["url"], value=url_report["og:title"])
-        changed = True
 
-    if "description" in url_report.keys():
-        urls_manager.set_url_description(url=url["url"], value=url_report["description"])
-        changed = True
-    elif "og:description" in url_report.keys():
-        urls_manager.set_url_description(url=url["url"], value=url_report["og:description"])
+    db_fields = {}
+    db_fields["title"] = None
+    db_fields["description"] = None
+    db_fields["url_destiny"] = None
+
+    if "metatags" in sniffing_config.keys():
+        for tag, bd_field in sniffing_config["metatags"].items():
+            if tag in url_report.keys():
+                if bd_field[:1] == "+":
+                    if db_fields[bd_field[1:]] is None:
+                        db_fields[bd_field[1:]] = ""
+                    db_fields[bd_field[1:]] = (
+                        db_fields[bd_field[1:]] + " " + url_report[tag]
+                    )
+                else:
+                    db_fields[bd_field] = url_report[tag]
+
+    if "bodytags" in sniffing_config.keys():
+        for tag, bd_field in sniffing_config["bodytags"].items():
+            if tag in url_report.keys():
+                if bd_field[:1] == "+":
+                    if db_fields[bd_field[1:]] is None:
+                        db_fields[bd_field[1:]] = ""
+                    db_fields[bd_field[1:]] = (
+                        db_fields[bd_field[1:]] + " " + url_report[tag]
+                    )
+                else:
+                    db_fields[bd_field] = url_report[tag]
+
+    if (
+        "atags" in sniffing_config.keys()
+        and "first-tag-as-url_destiny" in sniffing_config["atags"].keys()
+    ):
+        if (
+            url_report["total-a-links"]
+            < sniffing_config["atags"]["first-tag-as-url_destiny"]
+        ):
+            if "first-a-link" in url_report.keys():
+                db_fields["url_destiny"] = url_report["first-a-link"]
+
+    if db_fields["title"] is not None:
+        urls_manager.set_url_title(url=url["url"], value=db_fields["title"])
         changed = True
-    if not changed:
-        urls_manager.set_url_error(url=url["url"], value="error: no h1 or description")
 
+    if db_fields["description"] is not None:
+        urls_manager.set_url_description(url=url["url"], value=db_fields["description"])
+        description_links = load_txt.put_urls_from_string(
+            text_to_process=db_fields["description"], parent_url=url["url"]
+        )
+        urls_manager.set_url_description_links(url=url["url"], value=description_links)
 
-def process_linkedin_post(url_report, url):
-    print("linkedin_post or generic")
-    print(url["url"])
-    changed = False
-    if "h1" in url_report.keys():
-        print(url["url"], ": ", url_report["h1"])
-        urls_manager.set_url_h1(url=url["url"], value=url_report["h1"])
-        changed = True
-    elif "og:title" in url_report.keys():
-        urls_manager.set_url_h1(url=url["url"], value=url_report["og:title"])
-        changed = True
-    description = None
-    if "description" in url_report.keys():
-        description = url_report["description"]
-        changed = True
-    elif "og:description" in url_report.keys():
-        description = url_report["og:description"]
         changed = True
 
-    if description is not None:
-        urls_manager.set_url_description(url=url["url"], value=description)
-        description_links = load_txt.put_urls_from_string(text_to_process=description, parent_url=url["url"])
-        urls_manager.set_url_description_links(url=url["url"], value=description_links)
+    if db_fields["url_destiny"] is not None:
+        urls_manager.add_url(url=db_fields["url_destiny"])
+        urls_manager.set_url_destiny(url=url["url"], destiny=db_fields["url_destiny"])
+        changed = True
 
     if not changed:
-        urls_manager.set_url_error(url=url["url"], value="error: no h1 or description")
-
-
-def scrap_url(url):
-    # TODO: Use get_urls_valid_prefix_by_id()
-    df = urls_manager.get_urls_valid_prefix()
-
-    # TODO: Need to change this
-
-    if url["url_type"] is None:
-        print("\n\ngeneric:", url["url"])
-        url["url_type"] = "generic"
-    else:
-        print("\n\n", url["url_type"] + ":", url["url"])
-    try:
-        url_report = sniff_url.get_tags(url=url["url"])
-    except Exception as e:
-        urls_manager.set_url_error(url=url["url"], value="error")
-        urls_manager.touch_url(url=url["url"])
-        print("\n\n!!! ERROR FOR:", url["url"])
-        print(
-            "\n\n!!! you can check the URL using the command sniff-url",
-            url["url"],
-            "\n\n",
+        urls_manager.set_url_error(
+            url=url["url"],
+            value="error: no title, url_destiny or description was founded",
         )
-        return
-
-    # linkedin_redirect - linkedin (https://lnkd.in/)
-    if url["url_type"] == "linkedin_redirect":
-        process_linkedin_redirect(url_report=url_report, url=url)
-
-    # linkedin_feed - linkedin (https://%.linkedin.com/feed/)
-    if url["url_type"] == "linkedin_feed":
-        process_linkedin_feed(url_report=url_report, url=url)
-
-    # linkedin_job - linkedin (https://www.linkedin.com/jobs/)
-    if url["url_type"] == "linkedin_job":
-        process_linkedin_job(url_report=url_report, url=url)
-
-    # linkedin_job - linkedin (https://www.linkedin.com/jobs/)
-    if url["url_type"] == "linkedin_post" or url["url_type"] == "generic":
-        process_linkedin_post(url_report=url_report, url=url)
-
-    urls_manager.set_url_json(url=url["url"], value=url_report["json"])
-    urls_manager.touch_url(url=url["url"])
 
 
 def isNaN(num):
@@ -144,35 +135,53 @@ def isNaN(num):
 
 
 def scrap_urls(
-    recursive=False, ignore_valid_prefix=False, randomize=False, only_parents=True
+    recursive=False,
+    ignore_valid_prefix=False,
+    randomize=False,
+    only_parents=True,
+    verbose=False,
+    n_urls=0,
 ):
+    limit = 10
     classify_urls.classify_urls()
     urls = urls_manager.get_untouched_urls(
         ignore_valid_prefix=ignore_valid_prefix,
         randomize=randomize,
        only_parents=only_parents,
+        limit=limit,
     )
     if len(urls) == 0:
-        print("no urls to scrap")
+        print("📭 no urls to scrap")
+        if n_urls > 0:
+            print(f"-- 🗃️ {n_urls} scraped urls in total...")
+        print("scrapping is over...")
        return
     for index, url in urls.iterrows():
-        scrap_url(url)
-
-        wait = random.randint(15, 20)
         wait = random.randint(1, 3)
-        print("sleeping for", wait, "seconds")
+        print(
+            "🐶 Scrapper is sleeping for", wait, "seconds before scraping next url..."
+        )
         time.sleep(wait)
 
+        print("🐕 Scrapper is sniffing the url...")
+        scrap_url(url=url, verbose=verbose)
+
+    n_urls = n_urls + len(urls)
+    print(f"-- 🗃️ {n_urls} scraped urls...")
     classify_urls.classify_urls()
     if recursive:
         wait = random.randint(5, 10)
-        print("sleeping for", wait, "seconds before next round")
+        print(
+            f"🐶 Scrapper is sleeping for {wait} seconds before next round of {limit} urls"
+        )
         time.sleep(wait)
         scrap_urls(
            recursive=recursive,
            ignore_valid_prefix=ignore_valid_prefix,
            randomize=randomize,
            only_parents=only_parents,
+            verbose=verbose,
+            n_urls=n_urls,
        )
    else:
-        print("ending...")
+        print("scrapping is over...")
@@ -1,7 +1,33 @@
 import ohmyscrapper.models.urls_manager as urls_manager
+from ohmyscrapper.core import config
 
 
 def seed():
-    urls_manager.seeds()
-    print("db seeded")
+    if not config.url_types_file_exists():
+        db_url_types = urls_manager.get_urls_valid_prefix()
+        if len(db_url_types) > 0:
+            export_url_types_to_file()
+            print("🪹 you have a new `url_types.yaml` based on your db! =)")
+        return
+
+    seeds = get_url_types_from_file()
+
+    if len(seeds) > 0:
+        urls_manager.seeds(seeds=seeds)
+        print("🫒 db seeded")
     return
+
+
+def get_url_types_from_file():
+    url_types_from_file = config.get_url_types()
+    if url_types_from_file is None:
+        url_types_from_file = {}
+    return url_types_from_file
+
+
+def export_url_types_to_file():
+    url_types = urls_manager.get_urls_valid_prefix()
+    yaml_url_types = {}
+    for index, url_type in url_types.iterrows():
+        yaml_url_types[url_type["url_type"]] = url_type["url_prefix"]
+    config.append_url_types(yaml_url_types)
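The new seeding flow round-trips a flat `url_type -> url_prefix` mapping through a `url_types.yaml` file: `export_url_types_to_file` builds the mapping from the valid-prefix table and hands it to `config.append_url_types`. A sketch of that mapping, using the LinkedIn prefixes from the comments removed in the scraper hunk above (the exact shipped defaults may differ):

from ohmyscrapper.core import config

# Illustrative url_type -> url_prefix mapping; the prefixes are taken from the old
# inline comments (lnkd.in, linkedin feed/jobs) and are not guaranteed to match
# what the package seeds by default.
yaml_url_types = {
    "linkedin_redirect": "https://lnkd.in/",
    "linkedin_feed": "https://%.linkedin.com/feed/",
    "linkedin_job": "https://www.linkedin.com/jobs/",
}
config.append_url_types(yaml_url_types)
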
@@ -1,10 +1,14 @@
 import ohmyscrapper.models.urls_manager as urls_manager
+from ohmyscrapper.core import config
 import math
+import os
 from rich.console import Console
 from rich.table import Table
 
 
 def export_urls(limit=0, csv_file="output/urls.csv", simplify=False):
+    output_folder = config.get_dir("output")
+
     df = urls_manager.get_urls(limit=limit)
 
     if simplify:
@@ -12,27 +16,31 @@ def export_urls(limit=0, csv_file="output/urls.csv", simplify=False):
 
     df.to_csv(csv_file, index=False)
     print("--------------------")
-    print("Urls exported to", csv_file)
-
-    df.replace(
-        {
-            "description": {r"\n": " "},
-        },
-        regex=True,
-        inplace=True,
-    )
+    print("📊🖋️ Urls exported to", csv_file)
+    if "description" in df:
+        try:
+            df.replace(
+                {
+                    "description": {r"\n": " "},
+                },
+                regex=True,
+                inplace=True,
+            )
+        except:
+            pass
     df.to_html(csv_file + "-preview.html", index=False)
-    print("Urls preview exported to", csv_file + "-preview.html")
+    print("📜🖋️ Urls preview exported to", csv_file + "-preview.html")
     print("--------------------")
 
 
 def export_report(csv_file="output/report.csv"):
+    output_folder = config.get_dir("output")
     df = urls_manager.get_urls_report()
 
     df.to_csv(csv_file, index=False)
     _clear_file(csv_file)
     print("--------------------")
-    print("Urls report exported to", csv_file)
+    print("📊🖋️ Urls report exported to", csv_file)
 
     df.replace(
         {
@@ -44,9 +52,10 @@ def export_report(csv_file="output/report.csv"):
     df.to_html(csv_file + "-preview.html", index=False)
     _clear_file(csv_file + "-preview.html")
 
-    print("Urls report preview exported to", csv_file + "-preview.html")
+    print("📜🖋️ Urls report preview exported to", csv_file + "-preview.html")
     print("--------------------")
 
+
 # TODO: Add transformation layer
 def _clear_file(txt_tile):
     with open(txt_tile, "r") as f:
@@ -56,6 +65,7 @@ def _clear_file(txt_tile):
     with open(txt_tile, "w") as f:
         f.write(content)
 
+
 def show_urls(limit=0, jump_to_page=0):
     df = urls_manager.get_urls(limit=limit)
     df.drop(columns=["json", "description"], inplace=True)
@@ -100,8 +110,6 @@ def show_urls(limit=0, jump_to_page=0):
 
     return
 
-    return
-
 
 # TODO: Change place
 def show_table(df):