ohmyscrapper 0.2.1__tar.gz

@@ -0,0 +1,110 @@
+ Metadata-Version: 2.3
+ Name: ohmyscrapper
+ Version: 0.2.1
+ Summary: A text-based scraper that collects links from plain text and builds a final PDF with general information about job openings.
+ Author: Cesar Cardoso gh@bouli
+ Author-email: Cesar Cardoso gh@bouli <hello@cesarcardoso.cc>
+ Requires-Dist: beautifulsoup4>=4.14.3
+ Requires-Dist: google-genai>=1.55.0
+ Requires-Dist: markdown>=3.10
+ Requires-Dist: pandas>=2.3.3
+ Requires-Dist: python-dotenv>=1.2.1
+ Requires-Dist: pyyaml>=6.0.3
+ Requires-Dist: requests>=2.32.5
+ Requires-Dist: rich>=14.2.0
+ Requires-Dist: urlextract>=1.9.0
+ Requires-Python: >=3.11
+ Description-Content-Type: text/markdown
+
+ # OhMyScrapper - v0.2.1
+
+ This project aims to create a text-based scraper that collects links from plain
+ text and builds a final PDF with general information about job openings.
+
+ > This project uses [uv](https://docs.astral.sh/uv/getting-started/installation/#standalone-installer) by default.
+
+ ## Scope
+
+ - Read texts;
+ - Extract links;
+ - Use meta og:tags to extract information;
+
+ ## Installation
+
+ I recommend using [uv](https://docs.astral.sh/uv/getting-started/installation/#standalone-installer), so you can just run the command below and everything is installed:
+ ```shell
+ uv sync
+ ```
+
+ ## How to use and test (development only)
+
+ OhMyScrapper works in 3 stages:
+
+ 1. It collects URLs from a text file (by default `input/_chat.txt`) and loads them into a database;
+ 2. It scrapes the collected URLs and reads what is relevant. If it finds new URLs, they are collected as well;
+ 3. It exports the list of URLs to CSV files.
+
+ You can run all 3 stages with the command:
+ ```shell
+ make start
+ ```
+ > Remember to add your text file to the folder `/input` with the name `_chat.txt`!
+
+ You will find the exported files in the folder `/output`, like this:
+ - `/output/report.csv`
+ - `/output/report.csv-preview.html`
+ - `/output/urls-simplified.csv`
+ - `/output/urls-simplified.csv-preview.html`
+ - `/output/urls.csv`
+ - `/output/urls.csv-preview.html`
+
+ ### BUT: if you want to go step by step, here is how:
+
+ First, we load the text file you would like to scan for URLs. The idea here is
+ to use a WhatsApp chat history, but it works with any txt file.
+
+ The default file is `input/_chat.txt`. If you use the default file, just run the
+ `load` command:
+ ```shell
+ make load
+ ```
+ Or, if you have another file, just use the `-file` argument like this:
+ ```shell
+ uv run main.py load -file=my-text-file.txt
+ ```
+ That will create a database if it doesn't exist and store every URL that
+ OhMyScrapper finds. After that, let's scrape the URLs with the `scrap-urls` command:
+
+ ```shell
+ make scrap-urls
+ ```
+
+ That will scrape only the LinkedIn URLs we are interested in. For now they are:
+ - linkedin_post: https://%.linkedin.com/posts/%
+ - linkedin_redirect: https://lnkd.in/%
+ - linkedin_job: https://%.linkedin.com/jobs/view/%
+ - linkedin_feed: https://%.linkedin.com/feed/%
+ - linkedin_company: https://%.linkedin.com/company/%
+
+ But we can scrape every other URL generically by using the `--ignore-type` argument:
+ ```shell
+ uv run main.py scrap-urls --ignore-type
+ ```
+
+ And we can ask it to run recursively by adding the `--recursive` argument:
+ ```shell
+ uv run main.py scrap-urls --recursive
+ ```
+ > !!! Important: we are not sure whether scraping may get blocked for making too many requests.
+
+ And we can finally export with the command:
+ ```shell
+ make export
+ ```
+
+
+ That's the basic usage!
+ But you can learn more using the help:
+ ```shell
+ uv run main.py --help
+ ```
@@ -0,0 +1,92 @@
+ # OhMyScrapper - v0.2.1
+
+ This project aims to create a text-based scraper that collects links from plain
+ text and builds a final PDF with general information about job openings.
+
+ > This project uses [uv](https://docs.astral.sh/uv/getting-started/installation/#standalone-installer) by default.
+
+ ## Scope
+
+ - Read texts;
+ - Extract links;
+ - Use meta og:tags to extract information (see the sketch after this list);
+
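+ As a rough illustration of the og:tags idea, here is a minimal sketch using
+ `requests` and `beautifulsoup4` (both are dependencies of this project). It is
+ not the project's own implementation, and the function name is just for
+ illustration:
+ ```python
+ import requests
+ from bs4 import BeautifulSoup
+
+
+ def read_og_tags(url: str) -> dict:
+     """Return a page's og:* meta tags as a {property: content} dict."""
+     html = requests.get(url, timeout=10).text
+     soup = BeautifulSoup(html, "html.parser")
+     tags = {}
+     for meta in soup.find_all("meta"):
+         prop = meta.get("property", "")
+         if prop.startswith("og:"):
+             tags[prop] = meta.get("content", "")
+     return tags
+
+
+ print(read_og_tags("https://cesarcardoso.cc/"))
+ ```
+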
+ ## Installation
+
+ I recommend using [uv](https://docs.astral.sh/uv/getting-started/installation/#standalone-installer), so you can just run the command below and everything is installed:
+ ```shell
+ uv sync
+ ```
+
+ ## How to use and test (development only)
+
+ OhMyScrapper works in 3 stages (sketched in code right after this list):
+
+ 1. It collects URLs from a text file (by default `input/_chat.txt`) and loads them into a database;
+ 2. It scrapes the collected URLs and reads what is relevant. If it finds new URLs, they are collected as well;
+ 3. It exports the list of URLs to CSV files.
+
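+ Judging from the CLI source shipped in this package (shown further down in this
+ diff), the three stages roughly boil down to the calls below. This is only an
+ illustration of the flow, using the CLI's own default values:
+ ```python
+ from ohmyscrapper.modules.load_txt import load_txt
+ from ohmyscrapper.modules.scrap_urls import scrap_urls
+ from ohmyscrapper.modules.show import export_urls
+
+ # Stage 1: collect URLs from the text file into the database.
+ load_txt("input/_chat.txt")
+
+ # Stage 2: visit the stored URLs and collect anything new they link to.
+ scrap_urls(recursive=False, ignore_valid_prefix=False, randomize=False, only_parents=False)
+
+ # Stage 3: dump the collected URLs to CSV.
+ export_urls(limit=0, csv_file="output/urls.csv", simplify=False)
+ ```
+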
+ You can run all 3 stages with the command:
+ ```shell
+ make start
+ ```
+ > Remember to add your text file to the folder `/input` with the name `_chat.txt`!
+
+ You will find the exported files in the folder `/output`, like this:
+ - `/output/report.csv`
+ - `/output/report.csv-preview.html`
+ - `/output/urls-simplified.csv`
+ - `/output/urls-simplified.csv-preview.html`
+ - `/output/urls.csv`
+ - `/output/urls.csv-preview.html`
+
+ ### BUT: if you want to go step by step, here is how:
+
+ First, we load the text file you would like to scan for URLs. The idea here is
+ to use a WhatsApp chat history, but it works with any txt file.
+
+ The default file is `input/_chat.txt`. If you use the default file, just run the
+ `load` command:
+ ```shell
+ make load
+ ```
+ Or, if you have another file, just use the `-file` argument like this:
+ ```shell
+ uv run main.py load -file=my-text-file.txt
+ ```
+ That will create a database if it doesn't exist and store every URL that
+ OhMyScrapper finds.
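+
+ If you are curious what this stage needs to do, it is essentially just pulling
+ URLs out of raw text, which is what `urlextract` (a declared dependency) is made
+ for. A minimal sketch, independent of the project's own loader; the file name is
+ only an example:
+ ```python
+ from urlextract import URLExtract
+
+ # Read a chat export (or any text file) and list every URL found in it.
+ extractor = URLExtract()
+ with open("input/_chat.txt", encoding="utf-8") as f:
+     text = f.read()
+
+ for url in sorted(set(extractor.find_urls(text))):
+     print(url)
+ ```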
+
+ After that, let's scrape the URLs with the `scrap-urls` command:
+
+ ```shell
+ make scrap-urls
+ ```
+
+ That will scrape only the LinkedIn URLs we are interested in. For now they are
+ (a sketch of how these patterns can be matched follows the list):
+ - linkedin_post: https://%.linkedin.com/posts/%
+ - linkedin_redirect: https://lnkd.in/%
+ - linkedin_job: https://%.linkedin.com/jobs/view/%
+ - linkedin_feed: https://%.linkedin.com/feed/%
+ - linkedin_company: https://%.linkedin.com/company/%
+
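+ These `%` wildcards read as "any characters here", in the style of SQL `LIKE`
+ patterns. Purely as an illustration of how a URL could be classified against
+ them (this is not the project's own classifier), one option is to translate
+ each pattern into a regular expression:
+ ```python
+ import re
+
+ # url_type -> pattern, taken from the list above; % means "any characters".
+ PATTERNS = {
+     "linkedin_post": "https://%.linkedin.com/posts/%",
+     "linkedin_redirect": "https://lnkd.in/%",
+     "linkedin_job": "https://%.linkedin.com/jobs/view/%",
+     "linkedin_feed": "https://%.linkedin.com/feed/%",
+     "linkedin_company": "https://%.linkedin.com/company/%",
+ }
+
+
+ def classify(url: str) -> str | None:
+     """Return the matching url_type, or None if no pattern matches."""
+     for url_type, pattern in PATTERNS.items():
+         regex = ".*".join(re.escape(part) for part in pattern.split("%"))
+         if re.fullmatch(regex, url):
+             return url_type
+     return None
+
+
+ print(classify("https://www.linkedin.com/jobs/view/123456/"))  # -> linkedin_job
+ ```
+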
+ But we can scrape every other URL generically by using the `--ignore-type` argument:
+ ```shell
+ uv run main.py scrap-urls --ignore-type
+ ```
+
+ And we can ask it to run recursively by adding the `--recursive` argument:
+ ```shell
+ uv run main.py scrap-urls --recursive
+ ```
+ > !!! Important: we are not sure whether scraping may get blocked for making too many requests.
+
+ And we can finally export with the command:
+ ```shell
+ make export
+ ```
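+
+ The `.csv` / `.csv-preview.html` pairs listed above suggest each table is
+ written twice. With `pandas` (a declared dependency) that pairing can be
+ produced as sketched below; this is an illustration with made-up rows, not
+ necessarily how the project's own export works:
+ ```python
+ from pathlib import Path
+
+ import pandas as pd
+
+ # Made-up example rows; the real export reads from the URL database.
+ df = pd.DataFrame(
+     [{"url": "https://lnkd.in/abc", "url_type": "linkedin_redirect", "title": "Job post"}]
+ )
+
+ Path("output").mkdir(exist_ok=True)
+ csv_file = "output/urls.csv"
+ df.to_csv(csv_file, index=False)  # the CSV itself
+ df.to_html(csv_file + "-preview.html", index=False)  # its HTML preview
+ ```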
+
+
+ That's the basic usage!
+ But you can learn more using the help:
+ ```shell
+ uv run main.py --help
+ ```
@@ -0,0 +1,42 @@
+ [project]
+ name = "ohmyscrapper"
+ version = "0.2.1"
+ description = "A text-based scraper that collects links from plain text and builds a final PDF with general information about job openings."
+ readme = "README.md"
+ authors = [
+     { name = "Cesar Cardoso gh@bouli", email = "hello@cesarcardoso.cc" }
+ ]
+ requires-python = ">=3.11"
+ dependencies = [
+     "beautifulsoup4>=4.14.3",
+     "google-genai>=1.55.0",
+     "markdown>=3.10",
+     "pandas>=2.3.3",
+     "python-dotenv>=1.2.1",
+     "pyyaml>=6.0.3",
+     "requests>=2.32.5",
+     "rich>=14.2.0",
+     "urlextract>=1.9.0",
+ ]
+
+ [project.scripts]
+ ohmyscrapper = "ohmyscrapper:main"
+
+ [build-system]
+ requires = ["uv_build>=0.9.17,<0.10.0"]
+ build-backend = "uv_build"
+
+ [tool.bumpversion]
+ tag = true
+ tag_name = "v{new_version}"
+ pre_commit_hooks = ["uv sync --upgrade", "git add uv.lock"]
+ commit = true
+
+ [[tool.bumpversion.files]]
+ filename = "pyproject.toml"
+
+ [[tool.bumpversion.files]]
+ filename = "README.md"
+
+ [[tool.bumpversion.files]]
+ filename = "src/ohmyscrapper/__init__.py"
@@ -0,0 +1,162 @@
+ import argparse
+
+ from ohmyscrapper.modules.classify_urls import classify_urls
+ from ohmyscrapper.modules.sniff_url import sniff_url
+ from ohmyscrapper.modules.load_txt import load_txt
+ from ohmyscrapper.modules.seed import seed
+ from ohmyscrapper.modules.scrap_urls import scrap_urls
+ from ohmyscrapper.modules.show import (
+     show_url,
+     show_urls,
+     show_urls_valid_prefix,
+     export_urls,
+     export_report,
+ )
+ from ohmyscrapper.modules.untouch_all import untouch_all
+ from ohmyscrapper.modules.process_with_ai import process_with_ai, reprocess_ai_history
+ from ohmyscrapper.modules.merge_dbs import merge_dbs
+
+
+ def main():
+     parser = argparse.ArgumentParser(prog="ohmyscrapper")
+     parser.add_argument("--version", action="version", version="%(prog)s v0.2.1")
+
+     subparsers = parser.add_subparsers(dest="command", help="Available commands")
+
+     ai_process_parser = subparsers.add_parser(
+         "process-with-ai", help="Process with AI."
+     )
+     ai_process_parser.add_argument(
+         "--history", default=False, help="Reprocess ai history", action="store_true"
+     )
+
+     seed_parser = subparsers.add_parser(
+         "seed", help="Seed database. Necessary to classify urls."
+     )
+     untouch_parser = subparsers.add_parser(
+         "untouch-all", help="Untouch all urls. That resets classification"
+     )
+
+     classify_urls_parser = subparsers.add_parser(
+         "classify-urls", help="Classify loaded urls"
+     )
+     classify_urls_parser.add_argument(
+         "--recursive", default=False, help="Run in recursive mode", action="store_true"
+     )
+
+     load_txt_parser = subparsers.add_parser("load", help="Load txt file")
+     load_txt_parser.add_argument(
+         "-file", default="input/_chat.txt", help="File path. Default is input/_chat.txt"
+     )
+
+     scrap_urls_parser = subparsers.add_parser("scrap-urls", help="Scrap urls")
+     scrap_urls_parser.add_argument(
+         "--recursive", default=False, help="Run in recursive mode", action="store_true"
+     )
+     scrap_urls_parser.add_argument(
+         "--ignore-type", default=False, help="Ignore urls types", action="store_true"
+     )
+     scrap_urls_parser.add_argument(
+         "--randomize", default=False, help="Random order", action="store_true"
+     )
+     scrap_urls_parser.add_argument(
+         "--only-parents", default=False, help="Only parents urls", action="store_true"
+     )
+
+     sniff_url_parser = subparsers.add_parser("sniff-url", help="Check url")
+     sniff_url_parser.add_argument(
+         # nargs="?" makes the positional optional so the default URL can be used
+         "url", nargs="?", default="https://cesarcardoso.cc/", help="Url to sniff"
+     )
+
+     show_urls_parser = subparsers.add_parser("show", help="Show urls and prefixes")
+     show_urls_parser.add_argument(
+         "--prefixes", default=False, help="Show urls valid prefix", action="store_true"
+     )
+     show_urls_parser.add_argument("--limit", default=0, help="Limit of lines to show")
+     show_urls_parser.add_argument("-url", default="", help="Url to show")
+
+     export_parser = subparsers.add_parser("export", help="Export urls to csv.")
+     export_parser.add_argument("--limit", default=0, help="Limit of lines to export")
+     export_parser.add_argument(
+         "--file",
+         default="output/urls.csv",
+         help="File path. Default is output/urls.csv",
+     )
+     export_parser.add_argument(
+         "--simplify",
+         default=False,
+         help="Ignore json and descriptions",
+         action="store_true",
+     )
+
+     report_parser = subparsers.add_parser("report", help="Export urls report to csv.")
+     merge_parser = subparsers.add_parser("merge_dbs", help="Merge databases.")
+
+     # TODO: What is that?
+     # seed_parser.set_defaults(func=seed)
+     # classify_urls_parser.set_defaults(func=classify_urls)
+     # load_txt_parser.set_defaults(func=load_txt)
+
+     args = parser.parse_args()
+
+     if args.command == "classify-urls":
+         classify_urls(args.recursive)
+         return
+
+     if args.command == "load":
+         load_txt(args.file)
+         return
+
+     if args.command == "seed":
+         seed()
+         return
+
+     if args.command == "untouch-all":
+         untouch_all()
+         return
+
+     if args.command == "sniff-url":
+         sniff_url(args.url)
+         return
+
+     if args.command == "scrap-urls":
+         scrap_urls(
+             recursive=args.recursive,
+             ignore_valid_prefix=args.ignore_type,
+             randomize=args.randomize,
+             only_parents=args.only_parents,
+         )
+         return
+
+     if args.command == "show":
+         if args.prefixes:
+             show_urls_valid_prefix(int(args.limit))
+             return
+         if args.url != "":
+             show_url(args.url)
+             return
+         show_urls(int(args.limit))
+         return
+
+     if args.command == "export":
+         export_urls(limit=int(args.limit), csv_file=args.file, simplify=args.simplify)
+         return
+
+     if args.command == "process-with-ai":
+         if args.history:
+             reprocess_ai_history()
+         else:
+             process_with_ai()
+         return
+
+     if args.command == "report":
+         export_report()
+         return
+
+     if args.command == "merge_dbs":
+         merge_dbs()
+         return
+
+
+ if __name__ == "__main__":
+     main()
@@ -0,0 +1,4 @@
+ from . import main
+
+ if __name__ == "__main__":
+     main()