ohmyscrapper 0.2.1__tar.gz → 0.3.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ohmyscrapper-0.2.1 → ohmyscrapper-0.3.4}/PKG-INFO +47 -27
- {ohmyscrapper-0.2.1 → ohmyscrapper-0.3.4}/README.md +43 -23
- {ohmyscrapper-0.2.1 → ohmyscrapper-0.3.4}/pyproject.toml +3 -3
- {ohmyscrapper-0.2.1 → ohmyscrapper-0.3.4}/src/ohmyscrapper/__init__.py +41 -15
- {ohmyscrapper-0.2.1 → ohmyscrapper-0.3.4}/src/ohmyscrapper/models/urls_manager.py +57 -16
- ohmyscrapper-0.3.4/src/ohmyscrapper/modules/classify_urls.py +27 -0
- ohmyscrapper-0.3.4/src/ohmyscrapper/modules/load_txt.py +94 -0
- {ohmyscrapper-0.2.1 → ohmyscrapper-0.3.4}/src/ohmyscrapper/modules/process_with_ai.py +63 -31
- {ohmyscrapper-0.2.1 → ohmyscrapper-0.3.4}/src/ohmyscrapper/modules/scrap_urls.py +80 -49
- {ohmyscrapper-0.2.1 → ohmyscrapper-0.3.4}/src/ohmyscrapper/modules/seed.py +1 -1
- {ohmyscrapper-0.2.1 → ohmyscrapper-0.3.4}/src/ohmyscrapper/modules/show.py +11 -4
- {ohmyscrapper-0.2.1 → ohmyscrapper-0.3.4}/src/ohmyscrapper/modules/untouch_all.py +1 -1
- ohmyscrapper-0.2.1/src/ohmyscrapper/modules/classify_urls.py +0 -23
- ohmyscrapper-0.2.1/src/ohmyscrapper/modules/load_txt.py +0 -32
- {ohmyscrapper-0.2.1 → ohmyscrapper-0.3.4}/src/ohmyscrapper/__main__.py +0 -0
- {ohmyscrapper-0.2.1 → ohmyscrapper-0.3.4}/src/ohmyscrapper/modules/merge_dbs.py +0 -0
- {ohmyscrapper-0.2.1 → ohmyscrapper-0.3.4}/src/ohmyscrapper/modules/sniff_url.py +0 -0

{ohmyscrapper-0.2.1 → ohmyscrapper-0.3.4}/PKG-INFO
@@ -1,9 +1,9 @@
 Metadata-Version: 2.3
 Name: ohmyscrapper
-Version: 0.2.1
-Summary:
-Author: Cesar Cardoso
-Author-email: Cesar Cardoso
+Version: 0.3.4
+Summary: OhMyScrapper scrapes texts and urls looking for links and jobs-data to create a final report with general information about job positions.
+Author: Cesar Cardoso
+Author-email: Cesar Cardoso <hello@cesarcardoso.cc>
 Requires-Dist: beautifulsoup4>=4.14.3
 Requires-Dist: google-genai>=1.55.0
 Requires-Dist: markdown>=3.10
@@ -16,39 +16,50 @@ Requires-Dist: urlextract>=1.9.0
 Requires-Python: >=3.11
 Description-Content-Type: text/markdown
 
-# OhMyScrapper - v0.2.1
+# 🐶 OhMyScrapper - v0.3.4
 
-
-final
-
-> This project is using [uv](https://docs.astral.sh/uv/getting-started/installation/#standalone-installer) by default.
+OhMyScrapper scrapes texts and urls looking for links and jobs-data to create a
+final report with general information about job positions.
 
 ## Scope
 
 - Read texts;
-- Extract
--
+- Extract and load urls;
+- Scrapes the urls looking for og:tags and titles;
+- Export a list of links with relevant information;
 
 ## Installation
 
+You can install directly in your `pip`:
+```shell
+pip install ohmyscrapper
+```
+
 I recomend to use the [uv](https://docs.astral.sh/uv/getting-started/installation/#standalone-installer), so you can just use the command bellow and everything is installed:
 ```shell
-uv
+uv add ohmyscrapper
+uv run ohmyscrapper --version
 ```
 
+But you can use everything as a tool, for example:
+```shell
+uvx ohmyscrapper --version
+```
+
+
 ## How to use and test (development only)
 
 OhMyScrapper works in 3 stages:
 
-1. It collects and loads urls from a text
+1. It collects and loads urls from a text in a database;
 2. It scraps/access the collected urls and read what is relevant. If it finds new urls, they are collected as well;
 3. Export a list of urls in CSV files;
 
 You can do 3 stages with the command:
 ```shell
-
+ohmyscrapper start
 ```
-> Remember to add your text file in the folder `/input` with the name
+> Remember to add your text file in the folder `/input` with the name that finishes with `.txt`!
 
 You will find the exported files in the folder `/output` like this:
 - `/output/report.csv`
@@ -60,23 +71,22 @@ You will find the exported files in the folder `/output` like this:
 
 ### BUT: if you want to do step by step, here it is:
 
-First we load a text file you would like to look for urls
-use the whatsapp history, but it works with any txt file.
+First we load a text file you would like to look for urls. It it works with any txt file.
 
-The default
-the command `load`:
+The default folder is `/input`. Put one or more text (finished with `.txt`) files
+in this folder and use the command `load`:
 ```shell
-
+ohmyscrapper load
 ```
-or, if you have another file, just use the argument `-file` like this:
+or, if you have another file in a different folder, just use the argument `-file` like this:
 ```shell
-
+ohmyscrapper load -file=my-text-file.txt
 ```
 That will create a database if it doesn't exist and store every url the oh-my-scrapper
 find. After that, let's scrap the urls with the command `scrap-urls`:
 
 ```shell
-
+ohmyscrapper scrap-urls --recursive --ignore-type
 ```
 
 That will scrap only the linkedin urls we are interested in. For now they are:
@@ -88,23 +98,33 @@ That will scrap only the linkedin urls we are interested in. For now they are:
 
 But we can use every other one generically using the argument `--ignore-type`:
 ```shell
-
+ohmyscrapper scrap-urls --ignore-type
 ```
 
 And we can ask to make it recursively adding the argument `--recursive`:
 ```shell
-
+ohmyscrapper scrap-urls --recursive
 ```
 > !!! important: we are not sure about blocks we can have for excess of requests
 
 And we can finally export with the command:
 ```shell
-
+ohmyscrapper export
+ohmyscrapper export --file=output/urls-simplified.csv --simplify
+ohmyscrapper report
 ```
 
 
 That's the basic usage!
 But you can understand more using the help:
 ```shell
-
+ohmyscrapper --help
 ```
+
+## See Also
+
+- Github: https://github.com/bouli/ohmyscrapper
+- PyPI: https://pypi.org/project/ohmyscrapper/
+
+## License
+This package is distributed under the [MIT license](https://opensource.org/license/MIT).

{ohmyscrapper-0.2.1 → ohmyscrapper-0.3.4}/README.md
@@ -1,36 +1,47 @@
-# OhMyScrapper - v0.2.1
+# 🐶 OhMyScrapper - v0.3.4
 
-
-final
-
-> This project is using [uv](https://docs.astral.sh/uv/getting-started/installation/#standalone-installer) by default.
+OhMyScrapper scrapes texts and urls looking for links and jobs-data to create a
+final report with general information about job positions.
 
 ## Scope
 
 - Read texts;
-- Extract
--
+- Extract and load urls;
+- Scrapes the urls looking for og:tags and titles;
+- Export a list of links with relevant information;
 
 ## Installation
 
+You can install directly in your `pip`:
+```shell
+pip install ohmyscrapper
+```
+
 I recomend to use the [uv](https://docs.astral.sh/uv/getting-started/installation/#standalone-installer), so you can just use the command bellow and everything is installed:
 ```shell
-uv
+uv add ohmyscrapper
+uv run ohmyscrapper --version
 ```
 
+But you can use everything as a tool, for example:
+```shell
+uvx ohmyscrapper --version
+```
+
+
 ## How to use and test (development only)
 
 OhMyScrapper works in 3 stages:
 
-1. It collects and loads urls from a text
+1. It collects and loads urls from a text in a database;
 2. It scraps/access the collected urls and read what is relevant. If it finds new urls, they are collected as well;
 3. Export a list of urls in CSV files;
 
 You can do 3 stages with the command:
 ```shell
-
+ohmyscrapper start
 ```
-> Remember to add your text file in the folder `/input` with the name
+> Remember to add your text file in the folder `/input` with the name that finishes with `.txt`!
 
 You will find the exported files in the folder `/output` like this:
 - `/output/report.csv`
@@ -42,23 +53,22 @@ You will find the exported files in the folder `/output` like this:
 
 ### BUT: if you want to do step by step, here it is:
 
-First we load a text file you would like to look for urls
-use the whatsapp history, but it works with any txt file.
+First we load a text file you would like to look for urls. It it works with any txt file.
 
-The default
-the command `load`:
+The default folder is `/input`. Put one or more text (finished with `.txt`) files
+in this folder and use the command `load`:
 ```shell
-
+ohmyscrapper load
 ```
-or, if you have another file, just use the argument `-file` like this:
+or, if you have another file in a different folder, just use the argument `-file` like this:
 ```shell
-
+ohmyscrapper load -file=my-text-file.txt
 ```
 That will create a database if it doesn't exist and store every url the oh-my-scrapper
 find. After that, let's scrap the urls with the command `scrap-urls`:
 
 ```shell
-
+ohmyscrapper scrap-urls --recursive --ignore-type
 ```
 
 That will scrap only the linkedin urls we are interested in. For now they are:
@@ -70,23 +80,33 @@ That will scrap only the linkedin urls we are interested in. For now they are:
 
 But we can use every other one generically using the argument `--ignore-type`:
 ```shell
-
+ohmyscrapper scrap-urls --ignore-type
 ```
 
 And we can ask to make it recursively adding the argument `--recursive`:
 ```shell
-
+ohmyscrapper scrap-urls --recursive
 ```
 > !!! important: we are not sure about blocks we can have for excess of requests
 
 And we can finally export with the command:
 ```shell
-
+ohmyscrapper export
+ohmyscrapper export --file=output/urls-simplified.csv --simplify
+ohmyscrapper report
 ```
 
 
 That's the basic usage!
 But you can understand more using the help:
 ```shell
-
+ohmyscrapper --help
 ```
+
+## See Also
+
+- Github: https://github.com/bouli/ohmyscrapper
+- PyPI: https://pypi.org/project/ohmyscrapper/
+
+## License
+This package is distributed under the [MIT license](https://opensource.org/license/MIT).
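
Taken together, the README changes above describe the new 0.3.4 workflow: a one-shot `start` command plus the step-by-step equivalents. The following is a minimal sketch of a full run assembled from the commands added in this diff, assuming one or more `.txt` files are already in `/input`; the `--ai` flag on `start` comes from the argparse changes to `__init__.py` further down.
```shell
# One-shot run: load /input/*.txt, scrape, and export (add --ai to also run the AI pass)
ohmyscrapper start
ohmyscrapper start --ai

# Step-by-step equivalent
ohmyscrapper load
ohmyscrapper scrap-urls --recursive --ignore-type
ohmyscrapper export
ohmyscrapper export --file=output/urls-simplified.csv --simplify
ohmyscrapper report
```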

{ohmyscrapper-0.2.1 → ohmyscrapper-0.3.4}/pyproject.toml
@@ -1,10 +1,10 @@
 [project]
 name = "ohmyscrapper"
-version = "0.2.1"
-description = "
+version = "0.3.4"
+description = "OhMyScrapper scrapes texts and urls looking for links and jobs-data to create a final report with general information about job positions."
 readme = "README.md"
 authors = [
-    { name = "Cesar Cardoso
+    { name = "Cesar Cardoso", email = "hello@cesarcardoso.cc" }
 ]
 requires-python = ">=3.11"
 dependencies = [

{ohmyscrapper-0.2.1 → ohmyscrapper-0.3.4}/src/ohmyscrapper/__init__.py
@@ -19,13 +19,22 @@ from ohmyscrapper.modules.merge_dbs import merge_dbs
 
 def main():
     parser = argparse.ArgumentParser(prog="ohmyscrapper")
-    parser.add_argument("--version", action="version", version="%(prog)s v0.2.1")
+    parser.add_argument("--version", action="version", version="%(prog)s v0.3.4")
 
     subparsers = parser.add_subparsers(dest="command", help="Available commands")
+    start_parser = subparsers.add_parser(
+        "start",
+        help="Make the entire process of 📦 loading, 🐶 scraping and 📜🖋️ exporting with the default configuration.",
+    )
 
-
-        "
+    start_parser.add_argument(
+        "--ai",
+        default=False,
+        help="Make the entire process of loading, processing, reprocessing with AI and exporting with the default configuration.",
+        action="store_true",
     )
+
+    ai_process_parser = subparsers.add_parser("ai", help="Process with AI.")
     ai_process_parser.add_argument(
         "--history", default=False, help="Reprocess ai history", action="store_true"
     )
@@ -44,12 +53,13 @@ def main():
         "--recursive", default=False, help="Run in recursive mode", action="store_true"
     )
 
-    load_txt_parser = subparsers.add_parser("load", help="Load txt file")
+    load_txt_parser = subparsers.add_parser("load", help="📦 Load txt file")
+    load_txt_parser.add_argument("-file", default=None, help="File path.")
     load_txt_parser.add_argument(
-        "
+        "--verbose", default=False, help="Run in verbose mode", action="store_true"
     )
 
-    scrap_urls_parser = subparsers.add_parser("scrap-urls", help="Scrap urls")
+    scrap_urls_parser = subparsers.add_parser("scrap-urls", help="🐶 Scrap urls")
     scrap_urls_parser.add_argument(
         "--recursive", default=False, help="Run in recursive mode", action="store_true"
     )
@@ -62,8 +72,11 @@ def main():
     scrap_urls_parser.add_argument(
         "--only-parents", default=False, help="Only parents urls", action="store_true"
     )
+    scrap_urls_parser.add_argument(
+        "--verbose", default=False, help="Run in verbose mode", action="store_true"
+    )
 
-    sniff_url_parser = subparsers.add_parser("sniff-url", help="Check url")
+    sniff_url_parser = subparsers.add_parser("sniff-url", help="🐕 Sniff/Check url")
     sniff_url_parser.add_argument(
         "url", default="https://cesarcardoso.cc/", help="Url to sniff"
     )
@@ -75,7 +88,7 @@ def main():
     show_urls_parser.add_argument("--limit", default=0, help="Limit of lines to show")
     show_urls_parser.add_argument("-url", default="", help="Url to show")
 
-    export_parser = subparsers.add_parser("export", help="Export urls to csv.")
+    export_parser = subparsers.add_parser("export", help="📊🖋️ Export urls to csv.")
     export_parser.add_argument("--limit", default=0, help="Limit of lines to export")
     export_parser.add_argument(
         "--file",
@@ -89,14 +102,11 @@ def main():
         action="store_true",
     )
 
-    report_parser = subparsers.add_parser(
+    report_parser = subparsers.add_parser(
+        "report", help="📜🖋️ Export urls report to csv."
+    )
     merge_parser = subparsers.add_parser("merge_dbs", help="Merge databases.")
 
-    # TODO: What is that?
-    # seed_parser.set_defaults(func=seed)
-    # classify_urls_parser.set_defaults(func=classify_urls)
-    # load_txt_parser.set_defaults(func=load_txt)
-
     args = parser.parse_args()
 
     if args.command == "classify-urls":
@@ -104,7 +114,7 @@ def main():
         return
 
     if args.command == "load":
-        load_txt(args.file)
+        load_txt(file_name=args.file, verbose=args.verbose)
         return
 
     if args.command == "seed":
@@ -125,6 +135,7 @@ def main():
             ignore_valid_prefix=args.ignore_type,
             randomize=args.randomize,
             only_parents=args.only_parents,
+            verbose=args.verbose,
        )
         return
 
@@ -157,6 +168,21 @@ def main():
         merge_dbs()
         return
 
+    if args.command == "start":
+        load_txt()
+        scrap_urls(
+            recursive=True,
+            ignore_valid_prefix=True,
+            randomize=False,
+            only_parents=False,
+        )
+        if args.ai:
+            process_with_ai()
+        export_urls()
+        export_urls(csv_file="output/urls-simplified.csv", simplify=True)
+        export_report()
+        return
+
 
 if __name__ == "__main__":
     main()
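
The argparse changes above also thread a new `--verbose` flag through the `load` and `scrap-urls` subcommands, and make `start` chain the whole pipeline (load, scrape, export, report), running `process_with_ai()` only when `--ai` is passed. A small sketch of the new flags, assuming they combine as the parser above defines them:
```shell
# New in 0.3.4: verbose output for the load and scrape stages
ohmyscrapper load --verbose
ohmyscrapper scrap-urls --recursive --ignore-type --verbose

# start chains load -> scrap-urls -> export -> report; --ai adds the AI pass before exporting
ohmyscrapper start --ai
```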