ohmyscrapper 0.1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,109 @@
1
+ Metadata-Version: 2.3
2
+ Name: ohmyscrapper
3
+ Version: 0.1.1
4
+ Summary: This project aims to create a text-based scraper containing links to create a final PDF with general information about job openings.
5
+ Author: Cesar Cardoso gh@bouli
6
+ Author-email: Cesar Cardoso gh@bouli <hello@cesarcardoso.cc>
7
+ Requires-Dist: beautifulsoup4>=4.14.3
8
+ Requires-Dist: google-genai>=1.55.0
9
+ Requires-Dist: pandas>=2.3.3
10
+ Requires-Dist: python-dotenv>=1.2.1
11
+ Requires-Dist: pyyaml>=6.0.3
12
+ Requires-Dist: requests>=2.32.5
13
+ Requires-Dist: rich>=14.2.0
14
+ Requires-Dist: urlextract>=1.9.0
15
+ Requires-Python: >=3.11
16
+ Description-Content-Type: text/markdown
17
+
18
+ # OhMyScrapper - v0.1.1
19
+
20
+ This project aims to create a text-based scraper containing links to create a
21
+ final PDF with general information about job openings.
22
+
23
+ > This project is using [uv](https://docs.astral.sh/uv/getting-started/installation/#standalone-installer) by default.
24
+
25
+ ## Scope
26
+
27
+ - Read texts;
28
+ - Extract links;
29
+ - Use meta og:tags to extract information;
30
+
31
+ ## Installation
32
+
33
+ I recommend using [uv](https://docs.astral.sh/uv/getting-started/installation/#standalone-installer), so you can just run the command below and everything is installed:
34
+ ```shell
35
+ uv sync
36
+ ```
37
+
38
+ ## How to use and test (development only)
39
+
40
+ OhMyScrapper works in 3 stages:
41
+
42
+ 1. It collects and loads urls from a text (by default `input/_chat.txt`) in a database;
43
+ 2. It scrapes/accesses the collected urls and reads what is relevant. If it finds new urls, they are collected as well;
44
+ 3. Export a list of urls in CSV files;
45
+
46
+ You can do 3 stages with the command:
47
+ ```shell
48
+ make start
49
+ ```
50
+ > Remember to add your text file in the folder `/input` with the name `_chat.txt`!
51
+
52
+ You will find the exported files in the folder `/output` like this:
53
+ - `/output/report.csv`
54
+ - `/output/report.csv-preview.html`
55
+ - `/output/urls-simplified.csv`
56
+ - `/output/urls-simplified.csv-preview.html`
57
+ - `/output/urls.csv`
58
+ - `/output/urls.csv-preview.html`
59
+
60
+ ### BUT: if you want to do step by step, here it is:
61
+
62
+ First we load a text file you would like to look for urls, the idea here is to
63
+ use the whatsapp history, but it works with any txt file.
64
+
65
+ The default file is `input/_chat.txt`. If you have the default file you just use
66
+ the command `load`:
67
+ ```shell
68
+ make load
69
+ ```
70
+ or, if you have another file, just use the argument `-file` like this:
71
+ ```shell
72
+ uv run main.py load -file=my-text-file.txt
73
+ ```
74
+ That will create a database if it doesn't exist and store every url the oh-my-scrapper
75
+ finds. After that, let's scrap the urls with the command `scrap-urls`:
76
+
77
+ ```shell
78
+ make scrap-urls
79
+ ```
80
+
81
+ That will scrap only the linkedin urls we are interested in. For now they are:
82
+ - linkedin_post: https://%.linkedin.com/posts/%
83
+ - linkedin_redirect: https://lnkd.in/%
84
+ - linkedin_job: https://%.linkedin.com/jobs/view/%
85
+ - linkedin_feed: https://%.linkedin.com/feed/%
86
+ - linkedin_company: https://%.linkedin.com/company/%
87
+
88
+ But we can use every other one generically using the argument `--ignore-type`:
89
+ ```shell
90
+ uv run main.py scrap-urls --ignore-type
91
+ ```
92
+
93
+ And we can ask to make it recursively adding the argument `--recursive`:
94
+ ```shell
95
+ uv run main.py scrap-urls --recursive
96
+ ```
97
+ > !!! important: we are not sure yet which blocks or rate limits the target sites may apply when too many requests are made
98
+
99
+ And we can finally export with the command:
100
+ ```shell
101
+ make export
102
+ ```
103
+
104
+
105
+ That's the basic usage!
106
+ But you can understand more using the help:
107
+ ```shell
108
+ uv run main.py --help
109
+ ```
@@ -0,0 +1,92 @@
1
+ # OhMyScrapper - v0.1.1
2
+
3
+ This project aims to create a text-based scraper containing links to create a
4
+ final PDF with general information about job openings.
5
+
6
+ > This project is using [uv](https://docs.astral.sh/uv/getting-started/installation/#standalone-installer) by default.
7
+
8
+ ## Scope
9
+
10
+ - Read texts;
11
+ - Extract links;
12
+ - Use meta og:tags to extract information;
13
+
14
+ ## Installation
15
+
16
+ I recommend using [uv](https://docs.astral.sh/uv/getting-started/installation/#standalone-installer), so you can just run the command below and everything is installed:
17
+ ```shell
18
+ uv sync
19
+ ```
20
+
21
+ ## How to use and test (development only)
22
+
23
+ OhMyScrapper works in 3 stages:
24
+
25
+ 1. It collects and loads urls from a text (by default `input/_chat.txt`) in a database;
26
+ 2. It scrapes/accesses the collected urls and reads what is relevant. If it finds new urls, they are collected as well;
27
+ 3. Export a list of urls in CSV files;
28
+
29
+ You can do 3 stages with the command:
30
+ ```shell
31
+ make start
32
+ ```
33
+ > Remember to add your text file in the folder `/input` with the name `_chat.txt`!
34
+
35
+ You will find the exported files in the folder `/output` like this:
36
+ - `/output/report.csv`
37
+ - `/output/report.csv-preview.html`
38
+ - `/output/urls-simplified.csv`
39
+ - `/output/urls-simplified.csv-preview.html`
40
+ - `/output/urls.csv`
41
+ - `/output/urls.csv-preview.html`
42
+
43
+ ### BUT: if you want to do step by step, here it is:
44
+
45
+ First we load a text file you would like to look for urls, the idea here is to
46
+ use the whatsapp history, but it works with any txt file.
47
+
48
+ The default file is `input/_chat.txt`. If you have the default file you just use
49
+ the command `load`:
50
+ ```shell
51
+ make load
52
+ ```
53
+ or, if you have another file, just use the argument `-file` like this:
54
+ ```shell
55
+ uv run main.py load -file=my-text-file.txt
56
+ ```
57
+ That will create a database if it doesn't exist and store every url the oh-my-scrapper
58
+ finds. After that, let's scrap the urls with the command `scrap-urls`:
59
+
60
+ ```shell
61
+ make scrap-urls
62
+ ```
63
+
64
+ That will scrap only the linkedin urls we are interested in. For now they are:
65
+ - linkedin_post: https://%.linkedin.com/posts/%
66
+ - linkedin_redirect: https://lnkd.in/%
67
+ - linkedin_job: https://%.linkedin.com/jobs/view/%
68
+ - linkedin_feed: https://%.linkedin.com/feed/%
69
+ - linkedin_company: https://%.linkedin.com/company/%
70
+
71
+ But we can use every other one generically using the argument `--ignore-type`:
72
+ ```shell
73
+ uv run main.py scrap-urls --ignore-type
74
+ ```
75
+
76
+ And we can ask to make it recursively adding the argument `--recursive`:
77
+ ```shell
78
+ uv run main.py scrap-urls --recursive
79
+ ```
80
+ > !!! important: we are not sure yet which blocks or rate limits the target sites may apply when too many requests are made
81
+
82
+ And we can finally export with the command:
83
+ ```shell
84
+ make export
85
+ ```
86
+
87
+
88
+ That's the basic usage!
89
+ But you can understand more using the help:
90
+ ```shell
91
+ uv run main.py --help
92
+ ```
@@ -0,0 +1,26 @@
1
+ [project]
2
+ name = "ohmyscrapper"
3
+ version = "0.1.1"
4
+ description = "This project aims to create a text-based scraper containing links to create a final PDF with general information about job openings."
5
+ readme = "README.md"
6
+ authors = [
7
+ { name = "Cesar Cardoso gh@bouli", email = "hello@cesarcardoso.cc" }
8
+ ]
9
+ requires-python = ">=3.11"
10
+ dependencies = [
11
+ "beautifulsoup4>=4.14.3",
12
+ "google-genai>=1.55.0",
13
+ "pandas>=2.3.3",
14
+ "python-dotenv>=1.2.1",
15
+ "pyyaml>=6.0.3",
16
+ "requests>=2.32.5",
17
+ "rich>=14.2.0",
18
+ "urlextract>=1.9.0",
19
+ ]
20
+
21
+ [project.scripts]
22
+ ohmyscrapper = "ohmyscrapper:main"
23
+
24
+ [build-system]
25
+ requires = ["uv_build>=0.9.17,<0.10.0"]
26
+ build-backend = "uv_build"
@@ -0,0 +1,155 @@
1
+ import argparse
2
+
3
+ from ohmyscrapper.modules.classify_urls import classify_urls
4
+ from ohmyscrapper.modules.sniff_url import sniff_url
5
+ from ohmyscrapper.modules.load_txt import load_txt
6
+ from ohmyscrapper.modules.seed import seed
7
+ from ohmyscrapper.modules.scrap_urls import scrap_urls
8
+ from ohmyscrapper.modules.show import (
9
+ show_url,
10
+ show_urls,
11
+ show_urls_valid_prefix,
12
+ export_urls,
13
+ export_report,
14
+ )
15
+ from ohmyscrapper.modules.untouch_all import untouch_all
16
+ from ohmyscrapper.modules.process_with_ai import process_with_ai
17
+ from ohmyscrapper.modules.merge_dbs import merge_dbs
18
+
19
+
20
def _build_parser():
    """Build the ``ohmyscrapper`` argument parser.

    One sub-parser is registered per command; the selected command name is
    stored in ``args.command`` (``None`` when no command is given).

    Returns:
        The configured ``argparse.ArgumentParser``.
    """
    parser = argparse.ArgumentParser(prog="ohmyscrapper")
    parser.add_argument("--version", action="version", version="%(prog)s v0.1.1")

    subparsers = parser.add_subparsers(dest="command", help="Available commands")

    # Commands without options of their own.
    subparsers.add_parser("process-with-ai", help="Process with AI.")
    subparsers.add_parser("seed", help="Seed database. Necessary to classify urls.")
    subparsers.add_parser(
        "untouch-all", help="Untouch all urls. That resets classification"
    )

    classify_urls_parser = subparsers.add_parser(
        "classify-urls", help="Classify loaded urls"
    )
    classify_urls_parser.add_argument(
        "--recursive", default=False, help="Run in recursive mode", action="store_true"
    )

    load_txt_parser = subparsers.add_parser("load", help="Load txt file")
    load_txt_parser.add_argument(
        "-file", default="input/_chat.txt", help="File path. Default is input/_chat.txt"
    )

    scrap_urls_parser = subparsers.add_parser("scrap-urls", help="Scrap urls")
    scrap_urls_parser.add_argument(
        "--recursive", default=False, help="Run in recursive mode", action="store_true"
    )
    scrap_urls_parser.add_argument(
        "--ignore-type", default=False, help="Ignore urls types", action="store_true"
    )
    scrap_urls_parser.add_argument(
        "--randomize", default=False, help="Random order", action="store_true"
    )
    scrap_urls_parser.add_argument(
        "--only-parents", default=False, help="Only parents urls", action="store_true"
    )

    sniff_url_parser = subparsers.add_parser("sniff-url", help="Check url")
    # nargs="?" makes the positional optional; without it argparse always
    # requires a value and the declared default can never be used.
    sniff_url_parser.add_argument(
        "url", nargs="?", default="https://cesarcardoso.cc/", help="Url to sniff"
    )

    show_urls_parser = subparsers.add_parser("show", help="Show urls and prefixes")
    show_urls_parser.add_argument(
        "--prefixes", default=False, help="Show urls valid prefix", action="store_true"
    )
    # type=int lets argparse reject non-numeric limits with a usage error
    # instead of failing later with a ValueError traceback.
    show_urls_parser.add_argument(
        "--limit", type=int, default=0, help="Limit of lines to show"
    )
    show_urls_parser.add_argument("-url", default="", help="Url to show")

    export_parser = subparsers.add_parser("export", help="Export urls to csv.")
    export_parser.add_argument(
        "--limit", type=int, default=0, help="Limit of lines to export"
    )
    export_parser.add_argument(
        "--file",
        default="output/urls.csv",
        help="File path. Default is output/urls.csv",
    )
    export_parser.add_argument(
        "--simplify",
        default=False,
        help="Ignore json and descriptions",
        action="store_true",
    )

    subparsers.add_parser("report", help="Export urls report to csv.")
    subparsers.add_parser("merge_dbs", help="Merge databases.")

    return parser


def main(argv=None):
    """Parse the command line and run the selected ``ohmyscrapper`` command.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default, used by the console-script entry point) argparse
            reads ``sys.argv[1:]``.

    Returns:
        None. When no command is given the help text is printed instead of
        silently doing nothing.

    Raises:
        SystemExit: Raised by argparse for ``--help``, ``--version`` and
            invalid arguments (including a non-integer ``--limit``).
    """
    parser = _build_parser()
    args = parser.parse_args(argv)
    command = args.command

    if command is None:
        # No sub-command selected: show usage so the user knows what exists.
        parser.print_help()
    elif command == "classify-urls":
        classify_urls(args.recursive)
    elif command == "load":
        load_txt(args.file)
    elif command == "seed":
        seed()
    elif command == "untouch-all":
        untouch_all()
    elif command == "sniff-url":
        sniff_url(args.url)
    elif command == "scrap-urls":
        scrap_urls(
            recursive=args.recursive,
            ignore_valid_prefix=args.ignore_type,
            randomize=args.randomize,
            only_parents=args.only_parents,
        )
    elif command == "show":
        # --prefixes takes precedence over -url; plain listing is the fallback.
        if args.prefixes:
            show_urls_valid_prefix(args.limit)
        elif args.url != "":
            show_url(args.url)
        else:
            show_urls(args.limit)
    elif command == "export":
        export_urls(limit=args.limit, csv_file=args.file, simplify=args.simplify)
    elif command == "process-with-ai":
        process_with_ai()
    elif command == "report":
        export_report()
    elif command == "merge_dbs":
        merge_dbs()
152
+
153
+
154
+ if __name__ == "__main__":
155
+ main()
@@ -0,0 +1,4 @@
1
+ from . import main
2
+
3
+ if __name__ == "__main__":
4
+ main()