ohmyscrapper 0.3.4__tar.gz → 0.4.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ohmyscrapper-0.3.4 → ohmyscrapper-0.4.0}/PKG-INFO +10 -4
- {ohmyscrapper-0.3.4 → ohmyscrapper-0.4.0}/README.md +9 -3
- {ohmyscrapper-0.3.4 → ohmyscrapper-0.4.0}/pyproject.toml +1 -1
- {ohmyscrapper-0.3.4 → ohmyscrapper-0.4.0}/src/ohmyscrapper/__init__.py +3 -3
- {ohmyscrapper-0.3.4 → ohmyscrapper-0.4.0}/src/ohmyscrapper/modules/load_txt.py +9 -5
- {ohmyscrapper-0.3.4 → ohmyscrapper-0.4.0}/src/ohmyscrapper/__main__.py +0 -0
- {ohmyscrapper-0.3.4 → ohmyscrapper-0.4.0}/src/ohmyscrapper/models/urls_manager.py +0 -0
- {ohmyscrapper-0.3.4 → ohmyscrapper-0.4.0}/src/ohmyscrapper/modules/classify_urls.py +0 -0
- {ohmyscrapper-0.3.4 → ohmyscrapper-0.4.0}/src/ohmyscrapper/modules/merge_dbs.py +0 -0
- {ohmyscrapper-0.3.4 → ohmyscrapper-0.4.0}/src/ohmyscrapper/modules/process_with_ai.py +0 -0
- {ohmyscrapper-0.3.4 → ohmyscrapper-0.4.0}/src/ohmyscrapper/modules/scrap_urls.py +0 -0
- {ohmyscrapper-0.3.4 → ohmyscrapper-0.4.0}/src/ohmyscrapper/modules/seed.py +0 -0
- {ohmyscrapper-0.3.4 → ohmyscrapper-0.4.0}/src/ohmyscrapper/modules/show.py +0 -0
- {ohmyscrapper-0.3.4 → ohmyscrapper-0.4.0}/src/ohmyscrapper/modules/sniff_url.py +0 -0
- {ohmyscrapper-0.3.4 → ohmyscrapper-0.4.0}/src/ohmyscrapper/modules/untouch_all.py +0 -0
{ohmyscrapper-0.3.4 → ohmyscrapper-0.4.0}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: ohmyscrapper
-Version: 0.3.4
+Version: 0.4.0
 Summary: OhMyScrapper scrapes texts and urls looking for links and jobs-data to create a final report with general information about job positions.
 Author: Cesar Cardoso
 Author-email: Cesar Cardoso <hello@cesarcardoso.cc>
@@ -16,7 +16,7 @@ Requires-Dist: urlextract>=1.9.0
 Requires-Python: >=3.11
 Description-Content-Type: text/markdown
 
-# 🐶 OhMyScrapper - v0.3.4
+# 🐶 OhMyScrapper - v0.4.0
 
 OhMyScrapper scrapes texts and urls looking for links and jobs-data to create a
 final report with general information about job positions.
@@ -78,10 +78,16 @@ in this folder and use the command `load`:
 ```shell
 ohmyscrapper load
 ```
-or, if you have another file in a different folder, just use the argument `-
+or, if you have another file in a different folder, just use the argument `-input` like this:
 ```shell
-ohmyscrapper load -
+ohmyscrapper load -input=my-text-file.txt
 ```
+In this case, you can add an url directly to the database, like this:
+```shell
+ohmyscrapper load -input=https://cesarcardoso.cc/
+```
+That will append the last url in the database to be scraped.
+
 That will create a database if it doesn't exist and store every url the oh-my-scrapper
 find. After that, let's scrap the urls with the command `scrap-urls`:
{ohmyscrapper-0.3.4 → ohmyscrapper-0.4.0}/README.md

@@ -1,4 +1,4 @@
-# 🐶 OhMyScrapper - v0.3.4
+# 🐶 OhMyScrapper - v0.4.0
 
 OhMyScrapper scrapes texts and urls looking for links and jobs-data to create a
 final report with general information about job positions.
@@ -60,10 +60,16 @@ in this folder and use the command `load`:
 ```shell
 ohmyscrapper load
 ```
-or, if you have another file in a different folder, just use the argument `-
+or, if you have another file in a different folder, just use the argument `-input` like this:
 ```shell
-ohmyscrapper load -
+ohmyscrapper load -input=my-text-file.txt
 ```
+In this case, you can add an url directly to the database, like this:
+```shell
+ohmyscrapper load -input=https://cesarcardoso.cc/
+```
+That will append the last url in the database to be scraped.
+
 That will create a database if it doesn't exist and store every url the oh-my-scrapper
 find. After that, let's scrap the urls with the command `scrap-urls`:
{ohmyscrapper-0.3.4 → ohmyscrapper-0.4.0}/src/ohmyscrapper/__init__.py

@@ -19,7 +19,7 @@ from ohmyscrapper.modules.merge_dbs import merge_dbs
 
 def main():
     parser = argparse.ArgumentParser(prog="ohmyscrapper")
-    parser.add_argument("--version", action="version", version="%(prog)s v0.3.4")
+    parser.add_argument("--version", action="version", version="%(prog)s v0.4.0")
 
     subparsers = parser.add_subparsers(dest="command", help="Available commands")
     start_parser = subparsers.add_parser(
@@ -54,7 +54,7 @@ def main():
     )
 
     load_txt_parser = subparsers.add_parser("load", help="📦 Load txt file")
-    load_txt_parser.add_argument("-
+    load_txt_parser.add_argument("-input", default=None, help="File path or url.")
     load_txt_parser.add_argument(
         "--verbose", default=False, help="Run in verbose mode", action="store_true"
     )
@@ -114,7 +114,7 @@ def main():
         return
 
     if args.command == "load":
-        load_txt(file_name=args.
+        load_txt(file_name=args.input, verbose=args.verbose)
         return
 
     if args.command == "seed":
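For readers unfamiliar with single-dash long options in argparse, the sketch below rebuilds just the `load` subcommand from the hunks above and shows that `-input` can be passed as `-input=value` or `-input value`, ending up on `args.input` exactly as the new `load_txt(file_name=args.input, ...)` call expects. This is an illustrative sketch, not the package's actual `main()`; only the option names come from the diff.

```python
import argparse

# Minimal sketch of the `load` subcommand wiring shown above; only the option
# names ("-input", "--verbose") come from the diff, the rest is illustrative.
parser = argparse.ArgumentParser(prog="ohmyscrapper")
subparsers = parser.add_subparsers(dest="command", help="Available commands")
load_parser = subparsers.add_parser("load", help="📦 Load txt file")
load_parser.add_argument("-input", default=None, help="File path or url.")
load_parser.add_argument(
    "--verbose", default=False, help="Run in verbose mode", action="store_true"
)

# argparse stores the single-dash option under the dest `input`, so both the
# `-input=...` and `-input ...` forms from the README examples work.
args = parser.parse_args(["load", "-input=https://cesarcardoso.cc/"])
print(args.command, args.input, args.verbose)  # -> load https://cesarcardoso.cc/ False
```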
{ohmyscrapper-0.3.4 → ohmyscrapper-0.4.0}/src/ohmyscrapper/modules/load_txt.py

@@ -22,11 +22,15 @@ def load_txt(file_name=None, verbose=False):
     if file_name is not None:
         print(f"📖 reading file `{file_name}`... ")
         if not os.path.exists(file_name):
-
-
-
-
-
+            if file_name.startswith("https://") or file_name.startswith("http://"):
+                text_file_content = " " + file_name + " "
+            else:
+                print(f"\n file `{file_name}` not found.")
+                return
+        else:
+            text_file_content = _increment_file_name(
+                text_file_content=text_file_content, file_name=file_name
+            )
     else:
         print("📂 reading /input directory... ")
         dir_files = "input"
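Functionally, the load_txt change means a `-input` value that is not an existing file is accepted when it starts with an http(s) scheme and treated as the text to scan for urls; otherwise the command reports the missing file and stops, while existing files keep going through the module's own `_increment_file_name` path. Below is a standalone sketch of that branching; `resolve_input` is a hypothetical name, and plain file reading stands in for the package's real helper.

```python
import os

def resolve_input(file_name: str) -> str | None:
    """Hypothetical helper mirroring the branching added to load_txt():
    return the text that should be scanned for urls, or None to abort."""
    if not os.path.exists(file_name):
        if file_name.startswith("https://") or file_name.startswith("http://"):
            # Not a file on disk, but it looks like a url: scan the bare url itself.
            return " " + file_name + " "
        print(f"\n file `{file_name}` not found.")
        return None
    # An existing file: the real module routes this through _increment_file_name();
    # plain reading is used here only as a stand-in.
    with open(file_name, encoding="utf-8") as fh:
        return fh.read()

print(resolve_input("https://cesarcardoso.cc/"))  # ->  https://cesarcardoso.cc/
```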