ohmyscrapper 0.2.3__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
ohmyscrapper/__init__.py CHANGED
@@ -3,7 +3,7 @@ import argparse
3
3
  from ohmyscrapper.modules.classify_urls import classify_urls
4
4
  from ohmyscrapper.modules.sniff_url import sniff_url
5
5
  from ohmyscrapper.modules.load_txt import load_txt
6
- from ohmyscrapper.modules.seed import seed
6
+ from ohmyscrapper.modules.seed import seed, export_url_types_to_file
7
7
  from ohmyscrapper.modules.scrap_urls import scrap_urls
8
8
  from ohmyscrapper.modules.show import (
9
9
  show_url,
@@ -15,30 +15,40 @@ from ohmyscrapper.modules.show import (
15
15
  from ohmyscrapper.modules.untouch_all import untouch_all
16
16
  from ohmyscrapper.modules.process_with_ai import process_with_ai, reprocess_ai_history
17
17
  from ohmyscrapper.modules.merge_dbs import merge_dbs
18
+ from ohmyscrapper.core.config import update
18
19
 
19
20
 
20
21
  def main():
21
22
  parser = argparse.ArgumentParser(prog="ohmyscrapper")
22
- parser.add_argument("--version", action="version", version="%(prog)s v0.2.3")
23
+ parser.add_argument("--version", action="version", version="%(prog)s v0.7.0")
23
24
 
25
+ update()
24
26
  subparsers = parser.add_subparsers(dest="command", help="Available commands")
25
27
  start_parser = subparsers.add_parser(
26
- "start", help="Make the entire process of loading, processing and exporting with the default configuration."
28
+ "start",
29
+ help="Make the entire process of 📦 loading, 🐶 scraping and 📜🖋️ exporting with the default configuration.",
27
30
  )
28
31
 
29
32
  start_parser.add_argument(
30
- "--ai", default=False, help="Make the entire process of loading, processing, reprocessing with AI and exporting with the default configuration.", action="store_true"
33
+ "--ai",
34
+ default=False,
35
+ help="Make the entire process of loading, processing, reprocessing with AI and exporting with the default configuration.",
36
+ action="store_true",
31
37
  )
32
38
 
33
- ai_process_parser = subparsers.add_parser(
34
- "ai", help="Process with AI."
35
- )
39
+ ai_process_parser = subparsers.add_parser("ai", help="Process with AI.")
36
40
  ai_process_parser.add_argument(
37
41
  "--history", default=False, help="Reprocess ai history", action="store_true"
38
42
  )
39
43
 
40
44
  seed_parser = subparsers.add_parser(
41
- "seed", help="Seed database. Necessary to classify urls."
45
+ "seed", help="Seed database with `url_types` to classify the `urls`."
46
+ )
47
+ seed_parser.add_argument(
48
+ "--export",
49
+ default=False,
50
+ help="Add all `url_types` from the bank to the `/ohmyscrapper/url_types.yaml` file.",
51
+ action="store_true",
42
52
  )
43
53
  untouch_parser = subparsers.add_parser(
44
54
  "untouch-all", help="Untouch all urls. That resets classification"
@@ -51,12 +61,15 @@ def main():
51
61
  "--recursive", default=False, help="Run in recursive mode", action="store_true"
52
62
  )
53
63
 
54
- load_txt_parser = subparsers.add_parser("load", help="Load txt file")
64
+ load_txt_parser = subparsers.add_parser("load", help="📦 Load txt file")
65
+ load_txt_parser.add_argument(
66
+ "-input", default=None, help="File/Folder path or url."
67
+ )
55
68
  load_txt_parser.add_argument(
56
- "-file", default="input/_chat.txt", help="File path. Default is input/_chat.txt"
69
+ "--verbose", default=False, help="Run in verbose mode", action="store_true"
57
70
  )
58
71
 
59
- scrap_urls_parser = subparsers.add_parser("scrap-urls", help="Scrap urls")
72
+ scrap_urls_parser = subparsers.add_parser("scrap-urls", help="🐶 Scrap urls")
60
73
  scrap_urls_parser.add_argument(
61
74
  "--recursive", default=False, help="Run in recursive mode", action="store_true"
62
75
  )
@@ -69,8 +82,11 @@ def main():
69
82
  scrap_urls_parser.add_argument(
70
83
  "--only-parents", default=False, help="Only parents urls", action="store_true"
71
84
  )
85
+ scrap_urls_parser.add_argument(
86
+ "--verbose", default=False, help="Run in verbose mode", action="store_true"
87
+ )
72
88
 
73
- sniff_url_parser = subparsers.add_parser("sniff-url", help="Check url")
89
+ sniff_url_parser = subparsers.add_parser("sniff-url", help="🐕 Sniff/Check url")
74
90
  sniff_url_parser.add_argument(
75
91
  "url", default="https://cesarcardoso.cc/", help="Url to sniff"
76
92
  )
@@ -82,7 +98,7 @@ def main():
82
98
  show_urls_parser.add_argument("--limit", default=0, help="Limit of lines to show")
83
99
  show_urls_parser.add_argument("-url", default="", help="Url to show")
84
100
 
85
- export_parser = subparsers.add_parser("export", help="Export urls to csv.")
101
+ export_parser = subparsers.add_parser("export", help="📊🖋️ Export urls to csv.")
86
102
  export_parser.add_argument("--limit", default=0, help="Limit of lines to export")
87
103
  export_parser.add_argument(
88
104
  "--file",
@@ -96,14 +112,11 @@ def main():
96
112
  action="store_true",
97
113
  )
98
114
 
99
- report_parser = subparsers.add_parser("report", help="Export urls report to csv.")
115
+ report_parser = subparsers.add_parser(
116
+ "report", help="📜🖋️ Export urls report to csv."
117
+ )
100
118
  merge_parser = subparsers.add_parser("merge_dbs", help="Merge databases.")
101
119
 
102
- # TODO: What is that?
103
- # seed_parser.set_defaults(func=seed)
104
- # classify_urls_parser.set_defaults(func=classify_urls)
105
- # load_txt_parser.set_defaults(func=load_txt)
106
-
107
120
  args = parser.parse_args()
108
121
 
109
122
  if args.command == "classify-urls":
@@ -111,11 +124,14 @@ def main():
111
124
  return
112
125
 
113
126
  if args.command == "load":
114
- load_txt(args.file)
127
+ load_txt(file_name=args.input, verbose=args.verbose)
115
128
  return
116
129
 
117
130
  if args.command == "seed":
118
- seed()
131
+ if args.export:
132
+ export_url_types_to_file()
133
+ else:
134
+ seed()
119
135
  return
120
136
 
121
137
  if args.command == "untouch-all":
@@ -132,6 +148,7 @@ def main():
132
148
  ignore_valid_prefix=args.ignore_type,
133
149
  randomize=args.randomize,
134
150
  only_parents=args.only_parents,
151
+ verbose=args.verbose,
135
152
  )
136
153
  return
137
154
 
@@ -166,7 +183,12 @@ def main():
166
183
 
167
184
  if args.command == "start":
168
185
  load_txt()
169
- scrap_urls(recursive=True,ignore_valid_prefix=True,randomize=False,only_parents=False)
186
+ scrap_urls(
187
+ recursive=True,
188
+ ignore_valid_prefix=True,
189
+ randomize=False,
190
+ only_parents=False,
191
+ )
170
192
  if args.ai:
171
193
  process_with_ai()
172
194
  export_urls()
@@ -0,0 +1,107 @@
1
+ import os
2
+ from ohmyscrapper.core import config_files
3
+
4
+ default_app_dir = "ohmyscrapper"
5
+
6
+
7
def get_dir(param="ohmyscrapper"):
    """Return the path of a working directory, creating it when missing.

    Args:
        param: Directory key. The app-directory name itself
            (``default_app_dir``) maps to ``./<name>``; any other key is
            looked up in the ``default_dirs`` section of the config file.

    Returns:
        The directory path as a string.

    Raises:
        Exception: Propagated from ``config_files.get_param`` when the key
            is not present in ``default_dirs``.
    """
    if param == default_app_dir:
        folder = "./" + param
    else:
        folder = config_files.get_param(
            parent_param="default_dirs",
            param=param,
            default_app_dir=default_app_dir,
        )

    # makedirs (unlike mkdir) can create nested configured paths such as
    # ./data/db, and exist_ok tolerates another process creating the folder
    # between the existence check and the creation.
    if not os.path.exists(folder):
        os.makedirs(folder, exist_ok=True)
    return folder
19
+
20
+
21
def get_files(param):
    """Look up a file name in the ``default_files`` section of the config.

    Args:
        param: Key inside ``default_files`` (for example ``"url_types"``).

    Returns:
        Whatever ``config_files.get_param`` yields for that key.
    """
    lookup = {
        "parent_param": "default_files",
        "param": param,
        "default_app_dir": default_app_dir,
    }
    return config_files.get_param(**lookup)
26
+
27
+
28
def get_db(param="db_file"):
    """Return a database setting.

    Args:
        param: ``"folder"`` resolves to the database directory via
            ``get_dir(param="db")``; any other value is read from the ``db``
            section of the config file.

    Returns:
        The directory path or the configured value for ``param``.
    """
    if param != "folder":
        return config_files.get_param(
            parent_param="db",
            param=param,
            default_app_dir=default_app_dir,
        )
    return get_dir(param="db")
34
+
35
+
36
def get_ai(param):
    """Read one value from the ``ai`` section of the config file.

    Args:
        param: Key inside the ``ai`` section.

    Returns:
        The configured value for ``param``.
    """
    section = "ai"
    value = config_files.get_param(
        parent_param=section,
        param=param,
        default_app_dir=default_app_dir,
    )
    return value
40
+
41
+
42
def load_config(force_default=False):
    """Load ``config.yaml`` from the app directory, falling back to defaults.

    The file is read through ``config_files.create_and_read_config_file``.
    When the result is missing or has no ``default_dirs`` section, the load
    is retried once with ``force_default=True``.

    Args:
        force_default: Forwarded to
            ``config_files.create_and_read_config_file``.

    Returns:
        The parsed configuration, guaranteed to contain ``default_dirs``.

    Raises:
        RuntimeError: If the configuration obtained with
            ``force_default=True`` is still unusable.
    """
    config_params = config_files.create_and_read_config_file(
        file_name="config.yaml",
        default_app_dir=default_app_dir,
        force_default=force_default,
    )

    if config_params is not None and "default_dirs" in config_params:
        return config_params

    if force_default:
        # The defaults were just loaded and are still unusable; retrying
        # again would recurse until RecursionError.
        raise RuntimeError(
            "Default config.yaml is empty or has no 'default_dirs' section."
        )
    return load_config(force_default=True)
54
+
55
+
56
def url_types_file_exists():
    """Tell whether the configured ``url_types`` file exists in the app dir.

    Returns:
        The result of ``config_files.config_file_exists`` for the file name
        configured under ``url_types``.
    """
    file_name = get_files("url_types")
    exists = config_files.config_file_exists(
        file_name,
        default_app_dir=default_app_dir,
    )
    return exists
61
+
62
+
63
def get_url_types():
    """Load the ``url_types`` config file, creating it if necessary.

    Returns:
        The parsed content returned by
        ``config_files.create_and_read_config_file``.
    """
    file_name = get_files("url_types")
    content = config_files.create_and_read_config_file(
        file_name,
        default_app_dir=default_app_dir,
    )
    return content
68
+
69
+
70
def get_url_sniffing():
    """Load the ``url_sniffing`` config file, creating it if necessary.

    Returns:
        The parsed content returned by
        ``config_files.create_and_read_config_file``.
    """
    file_name = get_files("url_sniffing")
    content = config_files.create_and_read_config_file(
        file_name,
        default_app_dir=default_app_dir,
    )
    return content
75
+
76
+
77
def append_url_sniffing(data):
    """Append ``data`` to the configured ``url_sniffing`` file.

    Args:
        data: Content handed to ``_append_config_file``.
    """
    _append_config_file(data, get_files("url_sniffing"))
80
+
81
+
82
def append_url_types(url_types):
    """Append ``url_types`` to the configured ``url_types`` file.

    Args:
        url_types: Content handed to ``_append_config_file``.
    """
    _append_config_file(url_types, get_files("url_types"))
85
+
86
+
87
def overwrite_config_file(data, file_name):
    """Replace the content of a config file inside the app directory.

    Args:
        data: Content to write.
        file_name: Name of the file, relative to the app directory.
    """
    config_files.overwrite_config_file(
        data=data,
        file_name=file_name,
        default_app_dir=default_app_dir,
    )
89
+
90
+
91
def _append_config_file(data, file_name):
    """Append ``data`` to a config file inside the app directory.

    Args:
        data: Content to append.
        file_name: Name of the file, relative to the app directory.
    """
    config_files.append_config_file(
        data=data,
        file_name=file_name,
        default_app_dir=default_app_dir,
    )
93
+
94
+
95
def update():
    """Offer to migrate the legacy ``./customize`` folder to ``./ohmyscrapper``.

    Nothing happens unless the legacy folder exists and the new one does not.
    In that case the user is prompted; answering exactly ``Y`` renames the
    legacy folder to the new name, any other answer leaves it untouched.
    """
    legacy_folder = "./customize"
    new_folder = "./ohmyscrapper"

    if not os.path.exists(legacy_folder) or os.path.exists(new_folder):
        return

    try:
        yes_no = input(
            "We detected a legacy folder system for your OhMyScrapper, would you like to update? \n"
            "If you don't update, a new version will be used and your legacy folder will be ignored. \n"
            "[Y] for yes or any other thing to ignore: "
        )
    except EOFError:
        # Non-interactive stdin (pipes, cron, CI): there is nobody to answer,
        # so behave like "ignore" instead of crashing the whole CLI.
        yes_no = ""

    if yes_no == "Y":
        os.rename(legacy_folder, new_folder)
        print(" You are up-to-date! =)")
        print("")
@@ -0,0 +1,73 @@
1
+ import os
2
+ import yaml
3
+
4
+
5
def create_and_read_config_file(file_name, default_app_dir, force_default=False):
    """Return the parsed content of a config file, seeding it from defaults.

    Args:
        file_name: Config file name inside ``default_app_dir``.
        default_app_dir: Directory holding the config files.
        force_default: When true, the default content is loaded and written
            over the file even if it already exists.

    Returns:
        The parsed YAML content.
    """
    path = config_file_path(file_name, default_app_dir)
    use_default = force_default or not os.path.exists(path)

    if not use_default:
        with open(path, "r") as handle:
            loaded = yaml.safe_load(handle.read())
        if loaded is not None:
            return loaded
        # The file parsed to nothing: start over from the defaults.
        return create_and_read_config_file(
            file_name=file_name,
            default_app_dir=default_app_dir,
            force_default=True,
        )

    defaults = _get_default_file(default_file=file_name)
    overwrite_config_file(
        data=defaults,
        file_name=file_name,
        default_app_dir=default_app_dir,
    )
    return defaults
20
+
21
+
22
def overwrite_config_file(data, file_name, default_app_dir):
    """Write ``data`` as YAML to a config file, replacing its content.

    Args:
        data: Content serialized with ``yaml.safe_dump``.
        file_name: Config file name inside ``default_app_dir``.
        default_app_dir: Directory holding the config files.
    """
    target = config_file_path(file_name, default_app_dir)
    with open(target, "+w") as out:
        serialized = yaml.safe_dump(data)
        out.write(serialized)
26
+
27
+
28
def append_config_file(data, file_name, default_app_dir):
    """Merge ``data`` into a YAML config file.

    The existing content is parsed, combined with ``data`` in memory and the
    result is written back in one step. Mappings are merged key by key at the
    top level (values from ``data`` win); lists are concatenated.

    Args:
        data: Mapping or list to add. ``None`` leaves the content unchanged.
        file_name: Config file name inside ``default_app_dir``.
        default_app_dir: Directory holding the config files.

    Raises:
        TypeError: If ``data`` and the existing content cannot be combined
            (for example a list appended to a mapping).
    """
    config_file = config_file_path(file_name, default_app_dir)

    existing = None
    if os.path.exists(config_file):
        with open(config_file, "r") as f:
            existing = yaml.safe_load(f)

    if existing is None:
        merged = data
    elif data is None:
        merged = existing
    elif isinstance(existing, dict) and isinstance(data, dict):
        merged = {**existing, **data}
    elif isinstance(existing, list) and isinstance(data, list):
        merged = existing + data
    else:
        raise TypeError(
            f"Cannot append {type(data).__name__} to {file_name}, "
            f"which contains {type(existing).__name__}."
        )

    # Serialize before opening for writing: if serialization fails, the file
    # on disk is left exactly as it was instead of truncated or half-written.
    serialized = yaml.safe_dump(merged, allow_unicode=True)
    with open(config_file, "w") as f:
        f.write(serialized)
39
+
40
+
41
def get_param(parent_param, param, default_app_dir):
    """Return one value from a section of ``config.yaml``.

    Args:
        parent_param: Top-level section name in ``config.yaml``.
        param: Key inside that section.
        default_app_dir: Directory holding the config files.

    Returns:
        The value stored under ``parent_param`` / ``param``.

    Raises:
        KeyError: If the section ``parent_param`` is missing.
        Exception: If ``param`` is not present in the section.
    """
    config = create_and_read_config_file(
        file_name="config.yaml", default_app_dir=default_app_dir
    )

    if parent_param not in config:
        # Still a KeyError, as a plain subscript would raise, but with a
        # message that says which section is missing.
        raise KeyError(f"Section {parent_param} does not exist in config.yaml.")

    # A section written as `name:` with no children parses to None; treat it
    # as empty so the caller gets the descriptive error below, not TypeError.
    section = config[parent_param] or {}

    if param in section:
        return section[param]
    raise Exception(f"{param} do not exist in your params {parent_param}.")
50
+
51
+
52
def config_file_exists(file_name, default_app_dir):
    """Tell whether a config file is present in the app directory.

    Note that resolving the path goes through ``config_file_path``, which
    creates ``default_app_dir`` when it does not exist yet.

    Args:
        file_name: Config file name inside ``default_app_dir``.
        default_app_dir: Directory holding the config files.

    Returns:
        ``True`` when the file exists, ``False`` otherwise.
    """
    candidate = config_file_path(file_name, default_app_dir)
    return os.path.exists(candidate)
54
+
55
+
56
def config_file_path(file_name, default_app_dir):
    """Build the path of a config file, making sure its directory exists.

    Args:
        file_name: Config file name.
        default_app_dir: Directory holding the config files; created when
            missing.

    Returns:
        ``default_app_dir`` joined with ``file_name``.
    """
    _ensure_default_app_dir(default_app_dir)
    return os.path.join(default_app_dir, file_name)
60
+
61
+
62
def _ensure_default_app_dir(default_app_dir):
    """Create the app directory when it does not exist yet.

    Args:
        default_app_dir: Directory path to create; intermediate directories
            are created as needed.
    """
    # makedirs supports nested paths, and exist_ok avoids a crash when the
    # directory appears between the existence check and the creation.
    if not os.path.exists(default_app_dir):
        os.makedirs(default_app_dir, exist_ok=True)
65
+
66
+
67
def _get_default_file(default_file):
    """Parse a default config file stored next to this module.

    Args:
        default_file: File name inside the ``default_files`` directory that
            sits beside this module.

    Returns:
        The content parsed with ``yaml.safe_load``.
    """
    module_dir = os.path.dirname(os.path.realpath(__file__))
    source_path = os.path.join(module_dir, "default_files", default_file)
    with open(source_path, "r") as stream:
        raw = stream.read()
    return yaml.safe_load(raw)
@@ -0,0 +1,16 @@
1
+ db:
2
+ db_file: local.db
3
+
4
+ default_dirs:
5
+ db: ./db
6
+ input: ./input
7
+ output: ./output
8
+ prompts: ./prompts
9
+ templates: ./templates
10
+
11
+ default_files:
12
+ url_types: url_types.yaml
13
+ url_sniffing: url_sniffing.yaml
14
+
15
+ ai:
16
+ default_prompt_file: prompt.md
@@ -0,0 +1,25 @@
1
+ linkedin_feed:
2
+ metatags:
3
+ og:url: url_destiny
4
+
5
+ linkedin_job:
6
+ bodytags:
7
+ h1: title
8
+ metatags:
9
+ og:title: title
10
+ og:description: description
11
+ description: description
12
+
13
+ linkedin_post:
14
+ bodytags:
15
+ h1: title
16
+ metatags:
17
+ og:title: title
18
+ og:description: description
19
+ description: description
20
+
21
+ linkedin_redirect:
22
+ metatags:
23
+ og:url: url_destiny
24
+ atags:
25
+ first-tag-as-url_destiny: 5
@@ -0,0 +1,5 @@
1
+ linkedin_company: https://%.linkedin.com/company/%
2
+ linkedin_feed: https://%.linkedin.com/feed/%
3
+ linkedin_job: https://%.linkedin.com/jobs/view/%
4
+ linkedin_post: https://%.linkedin.com/posts/%
5
+ linkedin_redirect: https://lnkd.in/%