ohmyscrapper 0.2.1__py3-none-any.whl → 0.2.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ohmyscrapper/__init__.py +19 -2
- {ohmyscrapper-0.2.1.dist-info → ohmyscrapper-0.2.3.dist-info}/METADATA +35 -15
- {ohmyscrapper-0.2.1.dist-info → ohmyscrapper-0.2.3.dist-info}/RECORD +5 -5
- {ohmyscrapper-0.2.1.dist-info → ohmyscrapper-0.2.3.dist-info}/WHEEL +0 -0
- {ohmyscrapper-0.2.1.dist-info → ohmyscrapper-0.2.3.dist-info}/entry_points.txt +0 -0
ohmyscrapper/__init__.py
CHANGED
@@ -19,12 +19,19 @@ from ohmyscrapper.modules.merge_dbs import merge_dbs
 
 def main():
     parser = argparse.ArgumentParser(prog="ohmyscrapper")
-    parser.add_argument("--version", action="version", version="%(prog)s v0.2.
+    parser.add_argument("--version", action="version", version="%(prog)s v0.2.3")
 
     subparsers = parser.add_subparsers(dest="command", help="Available commands")
+    start_parser = subparsers.add_parser(
+        "start", help="Make the entire process of loading, processing and exporting with the default configuration."
+    )
+
+    start_parser.add_argument(
+        "--ai", default=False, help="Make the entire process of loading, processing, reprocessing with AI and exporting with the default configuration.", action="store_true"
+    )
 
     ai_process_parser = subparsers.add_parser(
-        "
+        "ai", help="Process with AI."
     )
     ai_process_parser.add_argument(
         "--history", default=False, help="Reprocess ai history", action="store_true"

@@ -157,6 +164,16 @@ def main():
         merge_dbs()
         return
 
+    if args.command == "start":
+        load_txt()
+        scrap_urls(recursive=True,ignore_valid_prefix=True,randomize=False,only_parents=False)
+        if args.ai:
+            process_with_ai()
+        export_urls()
+        export_urls(csv_file="output/urls-simplified.csv", simplify=True)
+        export_report()
+        return
+
 
 if __name__ == "__main__":
     main()
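Read together, the two hunks above register a new `start` subcommand with an optional `--ai` switch and dispatch it to the existing pipeline functions (`load_txt`, `scrap_urls`, `process_with_ai`, `export_urls`, `export_report`). Below is a minimal, self-contained sketch of the same argparse pattern; the stage functions are stand-ins, since the real implementations live in ohmyscrapper's modules.

```python
import argparse

# Stand-in stages; in the real package these are imported from ohmyscrapper's modules.
def load_txt():
    print("loading input/_chat.txt")

def scrap_urls(**kwargs):
    print("scraping urls with", kwargs)

def process_with_ai():
    print("reprocessing with AI")

def export_urls(csv_file=None, simplify=False):
    print("exporting urls to", csv_file or "the default csv")

def export_report():
    print("exporting the report")

def main():
    parser = argparse.ArgumentParser(prog="ohmyscrapper")
    subparsers = parser.add_subparsers(dest="command", help="Available commands")

    start_parser = subparsers.add_parser("start", help="Run the whole default pipeline.")
    # store_true turns --ai into a boolean switch that defaults to False.
    start_parser.add_argument("--ai", default=False, action="store_true",
                              help="Also reprocess the scraped urls with AI.")

    args = parser.parse_args()

    if args.command == "start":
        load_txt()
        scrap_urls(recursive=True, ignore_valid_prefix=True, randomize=False, only_parents=False)
        if args.ai:
            process_with_ai()
        export_urls()
        export_urls(csv_file="output/urls-simplified.csv", simplify=True)
        export_report()

if __name__ == "__main__":
    main()
```

Running the sketch with `start` walks the plain pipeline; adding `--ai` flips `args.ai` to True and inserts the `process_with_ai()` step, which is exactly the dispatch the second hunk adds.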
{ohmyscrapper-0.2.1.dist-info → ohmyscrapper-0.2.3.dist-info}/METADATA
CHANGED

@@ -1,9 +1,9 @@
 Metadata-Version: 2.3
 Name: ohmyscrapper
-Version: 0.2.
+Version: 0.2.3
 Summary: This project aims to create a text-based scraper containing links to create a final PDF with general information about job openings.
-Author: Cesar Cardoso
-Author-email: Cesar Cardoso
+Author: Cesar Cardoso
+Author-email: Cesar Cardoso <hello@cesarcardoso.cc>
 Requires-Dist: beautifulsoup4>=4.14.3
 Requires-Dist: google-genai>=1.55.0
 Requires-Dist: markdown>=3.10

@@ -16,13 +16,11 @@ Requires-Dist: urlextract>=1.9.0
 Requires-Python: >=3.11
 Description-Content-Type: text/markdown
 
-# OhMyScrapper - v0.2.
+# OhMyScrapper - v0.2.3
 
 This project aims to create a text-based scraper containing links to create a
 final PDF with general information about job openings.
 
-> This project is using [uv](https://docs.astral.sh/uv/getting-started/installation/#standalone-installer) by default.
-
 ## Scope
 
 - Read texts;

@@ -31,11 +29,23 @@ final PDF with general information about job openings.
 
 ## Installation
 
+You can install directly in your `pip`:
+```shell
+pip install ohmyscrapper
+```
+
 I recomend to use the [uv](https://docs.astral.sh/uv/getting-started/installation/#standalone-installer), so you can just use the command bellow and everything is installed:
 ```shell
-uv
+uv add ohmyscrapper
+uv run ohmyscrapper --version
 ```
 
+But you can use everything as a tool, for example:
+```shell
+uvx ohmyscrapper --version
+```
+
+
 ## How to use and test (development only)
 
 OhMyScrapper works in 3 stages:

@@ -46,7 +56,7 @@ OhMyScrapper works in 3 stages:
 
 You can do 3 stages with the command:
 ```shell
-
+ohmyscrapper start
 ```
 > Remember to add your text file in the folder `/input` with the name `_chat.txt`!
 

@@ -66,17 +76,17 @@ use the whatsapp history, but it works with any txt file.
 The default file is `input/_chat.txt`. If you have the default file you just use
 the command `load`:
 ```shell
-
+ohmyscrapper load
 ```
 or, if you have another file, just use the argument `-file` like this:
 ```shell
-
+ohmyscrapper load -file=my-text-file.txt
 ```
 That will create a database if it doesn't exist and store every url the oh-my-scrapper
 find. After that, let's scrap the urls with the command `scrap-urls`:
 
 ```shell
-
+ohmyscrapper scrap-urls --recursive --ignore-type
 ```
 
 That will scrap only the linkedin urls we are interested in. For now they are:

@@ -88,23 +98,33 @@ That will scrap only the linkedin urls we are interested in. For now they are:
 
 But we can use every other one generically using the argument `--ignore-type`:
 ```shell
-
+ohmyscrapper scrap-urls --ignore-type
 ```
 
 And we can ask to make it recursively adding the argument `--recursive`:
 ```shell
-
+ohmyscrapper scrap-urls --recursive
 ```
 > !!! important: we are not sure about blocks we can have for excess of requests
 
 And we can finally export with the command:
 ```shell
-
+ohmyscrapper export
+ohmyscrapper export --file=output/urls-simplified.csv --simplify
+ohmyscrapper report
 ```
 
 
 That's the basic usage!
 But you can understand more using the help:
 ```shell
-
+ohmyscrapper --help
 ```
+
+## See Also
+
+- Github: https://github.com/bouli/ohmyscrapper
+- PyPI: https://pypi.org/project/ohmyscrapper/
+
+## License
+This package is distributed under the [MIT license](https://opensource.org/license/MIT).
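The METADATA changes amount to the 0.2.3 version bump, a fuller author e-mail, and README additions (a plain `pip` install path, `uvx` usage, a See Also section and an MIT license note). As an aside, the same metadata can be read back from an installed environment with the standard library; only the distribution name `ohmyscrapper` is taken from this diff, nothing else is assumed.

```python
from importlib.metadata import metadata, version

# Reads the installed distribution's METADATA, i.e. the file changed in this hunk.
print(version("ohmyscrapper"))                    # e.g. "0.2.3" after the upgrade
print(metadata("ohmyscrapper")["Author-email"])   # e.g. "Cesar Cardoso <hello@cesarcardoso.cc>"
```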
{ohmyscrapper-0.2.1.dist-info → ohmyscrapper-0.2.3.dist-info}/RECORD
CHANGED

@@ -1,4 +1,4 @@
-ohmyscrapper/__init__.py,sha256=
+ohmyscrapper/__init__.py,sha256=6k-fyuKkTefy5lJiRJFDP7wfkGhYRC8qgdfYlNB_3sk,5841
 ohmyscrapper/__main__.py,sha256=5BjNuyet8AY-POwoF5rGt722rHQ7tJ0Vf0UFUfzzi-I,58
 ohmyscrapper/models/urls_manager.py,sha256=xKql_xdwfRwgpMyriuIrZ0Srz4gYQGMfWClEWpGRtNE,11183
 ohmyscrapper/modules/classify_urls.py,sha256=eyHtTHDZp2pGmYw_X-7LrbeVOgDPcRQdhu0oEuwQtKA,743

@@ -10,7 +10,7 @@ ohmyscrapper/modules/seed.py,sha256=KeTSbmTdNTkVCtzk9iQmeuEqB0kG-rTZJb2a1WdROL4,
 ohmyscrapper/modules/show.py,sha256=u0L9uxgU8Xt_-myA3r7byuOmnX_-2gkpTtXWkXon1ns,3572
 ohmyscrapper/modules/sniff_url.py,sha256=jQDc7aSimuOOedw2fSXZlf6_o0OqQHOr6NsWb4n0XgI,2720
 ohmyscrapper/modules/untouch_all.py,sha256=E1U9e3sOG7suzc8ZTWcYiQQo9mPmLJ0piXdXUjFLEd4,162
-ohmyscrapper-0.2.
-ohmyscrapper-0.2.
-ohmyscrapper-0.2.
-ohmyscrapper-0.2.
+ohmyscrapper-0.2.3.dist-info/WHEEL,sha256=xDCZ-UyfvkGuEHPeI7BcJzYKIZzdqN8A8o1M5Om8IyA,79
+ohmyscrapper-0.2.3.dist-info/entry_points.txt,sha256=BZud6D16XkfjelDa4Z33mji-KJbbZXgq2FoLrzjru5I,52
+ohmyscrapper-0.2.3.dist-info/METADATA,sha256=uwthvf7vwhb6H14KdbPLibzp1c3PaVmcJIYePmV8cRc,3832
+ohmyscrapper-0.2.3.dist-info/RECORD,,
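Each RECORD entry pairs a file path with a `sha256=` digest (urlsafe base64, without `=` padding) and the file size in bytes, which is what installers use to verify an unpacked wheel. A short sketch of how one of the entries above could be recomputed; the local path is illustrative.

```python
import base64
import hashlib
from pathlib import Path

def record_digest(path: str) -> str:
    """Return the RECORD-style suffix: urlsafe-base64 SHA-256 without '=' padding, plus size in bytes."""
    data = Path(path).read_bytes()
    digest = base64.urlsafe_b64encode(hashlib.sha256(data).digest()).rstrip(b"=").decode()
    return f"sha256={digest},{len(data)}"

# Illustrative path: point this at an installed copy and compare with the RECORD line above.
print(record_digest("ohmyscrapper/__init__.py"))
```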
{ohmyscrapper-0.2.1.dist-info → ohmyscrapper-0.2.3.dist-info}/WHEEL
File without changes

{ohmyscrapper-0.2.1.dist-info → ohmyscrapper-0.2.3.dist-info}/entry_points.txt
File without changes