ohmyscrapper 0.3.4__tar.gz → 0.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: ohmyscrapper
-Version: 0.3.4
+Version: 0.4.0
 Summary: OhMyScrapper scrapes texts and urls looking for links and jobs-data to create a final report with general information about job positions.
 Author: Cesar Cardoso
 Author-email: Cesar Cardoso <hello@cesarcardoso.cc>
@@ -16,7 +16,7 @@ Requires-Dist: urlextract>=1.9.0
 Requires-Python: >=3.11
 Description-Content-Type: text/markdown
 
-# 🐶 OhMyScrapper - v0.3.4
+# 🐶 OhMyScrapper - v0.4.0
 
 OhMyScrapper scrapes texts and urls looking for links and jobs-data to create a
 final report with general information about job positions.
@@ -78,10 +78,16 @@ in this folder and use the command `load`:
 ```shell
 ohmyscrapper load
 ```
-or, if you have another file in a different folder, just use the argument `-file` like this:
+or, if you have a file in a different folder, just use the argument `-input` like this:
 ```shell
-ohmyscrapper load -file=my-text-file.txt
+ohmyscrapper load -input=my-text-file.txt
 ```
+You can also add a URL directly to the database, like this:
+```shell
+ohmyscrapper load -input=https://cesarcardoso.cc/
+```
+That will append the URL to the database to be scraped.
+
 
 That will create a database if it doesn't exist and store every URL that OhMyScrapper
 finds. After that, let's scrape the URLs with the command `scrap-urls`:
@@ -1,4 +1,4 @@
-# 🐶 OhMyScrapper - v0.3.4
+# 🐶 OhMyScrapper - v0.4.0
 
 OhMyScrapper scrapes texts and urls looking for links and jobs-data to create a
 final report with general information about job positions.
@@ -60,10 +60,16 @@ in this folder and use the command `load`:
 ```shell
 ohmyscrapper load
 ```
-or, if you have another file in a different folder, just use the argument `-file` like this:
+or, if you have a file in a different folder, just use the argument `-input` like this:
 ```shell
-ohmyscrapper load -file=my-text-file.txt
+ohmyscrapper load -input=my-text-file.txt
 ```
+You can also add a URL directly to the database, like this:
+```shell
+ohmyscrapper load -input=https://cesarcardoso.cc/
+```
+That will append the URL to the database to be scraped.
+
 
 That will create a database if it doesn't exist and store every URL that OhMyScrapper
 finds. After that, let's scrape the URLs with the command `scrap-urls`:
@@ -1,6 +1,6 @@
 [project]
 name = "ohmyscrapper"
-version = "0.3.4"
+version = "0.4.0"
 description = "OhMyScrapper scrapes texts and urls looking for links and jobs-data to create a final report with general information about job positions."
 readme = "README.md"
 authors = [
@@ -19,7 +19,7 @@ from ohmyscrapper.modules.merge_dbs import merge_dbs
 
 def main():
     parser = argparse.ArgumentParser(prog="ohmyscrapper")
-    parser.add_argument("--version", action="version", version="%(prog)s v0.3.4")
+    parser.add_argument("--version", action="version", version="%(prog)s v0.4.0")
 
     subparsers = parser.add_subparsers(dest="command", help="Available commands")
     start_parser = subparsers.add_parser(
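For reference, argparse's built-in `version` action prints the supplied string (with `%(prog)s` expanded) and exits, so after this bump `ohmyscrapper --version` reports `ohmyscrapper v0.4.0`. A minimal standalone sketch of that behavior, not the package's full `main()`:

```python
import argparse

# Sketch: the same --version wiring as the diff above, in isolation.
parser = argparse.ArgumentParser(prog="ohmyscrapper")
parser.add_argument("--version", action="version", version="%(prog)s v0.4.0")

try:
    parser.parse_args(["--version"])  # prints "ohmyscrapper v0.4.0" to stdout
except SystemExit:
    pass  # action="version" exits after printing; caught here only for demonstration
```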
@@ -54,7 +54,7 @@ def main():
     )
 
     load_txt_parser = subparsers.add_parser("load", help="📦 Load txt file")
-    load_txt_parser.add_argument("-file", default=None, help="File path.")
+    load_txt_parser.add_argument("-input", default=None, help="File path or url.")
     load_txt_parser.add_argument(
         "--verbose", default=False, help="Run in verbose mode", action="store_true"
     )
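Worth noting: `-input` is a single-dash long option, which argparse accepts; its destination defaults to `input`, and both the `-input value` and `-input=value` spellings shown in the README parse identically. A standalone sketch of just the `load` subcommand (the real parser defines several more subcommands):

```python
import argparse

# Sketch: the new `load` option in isolation, mirroring the diff above.
parser = argparse.ArgumentParser(prog="ohmyscrapper")
subparsers = parser.add_subparsers(dest="command", help="Available commands")
load_txt_parser = subparsers.add_parser("load", help="📦 Load txt file")
load_txt_parser.add_argument("-input", default=None, help="File path or url.")

# Both spellings populate args.input the same way.
args = parser.parse_args(["load", "-input=my-text-file.txt"])
assert args.input == "my-text-file.txt"

args = parser.parse_args(["load", "-input", "https://cesarcardoso.cc/"])
assert args.input == "https://cesarcardoso.cc/"
```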
@@ -114,7 +114,7 @@ def main():
         return
 
     if args.command == "load":
-        load_txt(file_name=args.file, verbose=args.verbose)
+        load_txt(file_name=args.input, verbose=args.verbose)
         return
 
     if args.command == "seed":
@@ -22,11 +22,15 @@ def load_txt(file_name=None, verbose=False):
     if file_name is not None:
         print(f"📖 reading file `{file_name}`... ")
         if not os.path.exists(file_name):
-            print(f"\n file `{file_name}` not found.")
-            return
-        text_file_content = _increment_file_name(
-            text_file_content=text_file_content, file_name=file_name
-        )
+            if file_name.startswith("https://") or file_name.startswith("http://"):
+                text_file_content = " " + file_name + " "
+            else:
+                print(f"\n file `{file_name}` not found.")
+                return
+        else:
+            text_file_content = _increment_file_name(
+                text_file_content=text_file_content, file_name=file_name
+            )
     else:
         print("📂 reading /input directory... ")
         dir_files = "input"
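The effect of this hunk: an argument that exists on disk is still treated as a file, and only a non-existent argument starting with `http://` or `https://` is passed through as raw text, from which the downstream URL extraction can pick the URL back out. A hedged sketch of that resolution order; `resolve_input` is a hypothetical stand-in, with `_increment_file_name` and the real text-accumulation logic elided:

```python
import os

def resolve_input(file_name: str) -> str:
    """Hypothetical sketch of load_txt's new branching, not the actual function."""
    if os.path.exists(file_name):
        # An existing path wins, even if its name happens to look like a URL.
        with open(file_name, encoding="utf-8") as f:
            return f.read()
    if file_name.startswith(("https://", "http://")):
        # Non-existent path that looks like a URL: the URL itself becomes
        # the text to scan, padded with spaces as in the diff above.
        return " " + file_name + " "
    raise FileNotFoundError(f"file `{file_name}` not found.")
```

Note the ordering: `os.path.exists` is checked first, so the URL branch only fires for arguments that don't resolve locally.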