poodigger 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Asinerum Conlang Project
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,25 @@
1
+ Metadata-Version: 2.4
2
+ Name: poodigger
3
+ Version: 1.0.0
4
+ Summary: Blogger Comment Digging Toolkit
5
+ Home-page: https://github.com/asinerum/poodigger
6
+ Author: Asinerum Conlang Project
7
+ Author-email: asinerum.com@gmail.com
8
+ License: MIT
9
+ Classifier: Programming Language :: Python :: 3
10
+ Classifier: License :: OSI Approved :: MIT License
11
+ Classifier: Operating System :: OS Independent
12
+ Requires-Python: >=3.7
13
+ Description-Content-Type: text/markdown
14
+ License-File: LICENSE
15
+ Requires-Dist: bs4>=0.0.2
16
+ Requires-Dist: duckdb>=1.4.2
17
+ Requires-Dist: formatize>=1.0.3
18
+ Requires-Dist: jinja2>=3.1.6
19
+ Requires-Dist: pandas>=2.3.3
20
+ Dynamic: license-file
21
+
22
+ Detailed tips, tricks, and examples, can be found at project's repository
23
+ https://github.com/asinerum/poodigger
24
+
25
+ (C) 2026 Asinerum Conlang Project
@@ -0,0 +1,4 @@
1
+ Detailed tips, tricks, and examples can be found in the project's repository:
2
+ https://github.com/asinerum/poodigger
3
+
4
+ (C) 2026 Asinerum Conlang Project
@@ -0,0 +1,3 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61.0"]
3
+ build-backend = "setuptools.build_meta"
@@ -0,0 +1,34 @@
1
+ [metadata]
2
+ name = poodigger
3
+ version = 1.0.0
4
+ author = Asinerum Conlang Project
5
+ author_email = asinerum.com@gmail.com
6
+ description = Blogger Comment Digging Toolkit
7
+ long_description = file: README.md
8
+ long_description_content_type = text/markdown
9
+ url = https://github.com/asinerum/poodigger
10
+ license = MIT
11
+ classifiers =
12
+ Programming Language :: Python :: 3
13
+ License :: OSI Approved :: MIT License
14
+ Operating System :: OS Independent
15
+
16
+ [options]
17
+ package_dir =
18
+ = src
19
+ packages = find:
20
+ python_requires = >=3.7
21
+ install_requires =
22
+ bs4 >= 0.0.2
23
+ duckdb >= 1.4.2
24
+ formatize >= 1.0.3
25
+ jinja2 >= 3.1.6
26
+ pandas >= 2.3.3
27
+
28
+ [options.packages.find]
29
+ where = src
30
+
31
+ [egg_info]
32
+ tag_build =
33
+ tag_date = 0
34
+
@@ -0,0 +1,3 @@
1
+ from .parquet import *
2
+
3
+ __version__ = "1.0.0" ## Package version; setup.cfg declares the same 1.0.0
@@ -0,0 +1,312 @@
1
+ import re
2
+ import sys
3
+ import duckdb
4
+ import jinja2
5
+ import requests
6
+ import pandas as pd
7
+ from pathlib import Path
8
+ from bs4 import BeautifulSoup
9
+ from formatize import parse_date
10
+
11
def wormsave(fromdate: str, todate: str, **kwargs):
    """Dig posts for a date range and save each one as a rendered HTML file.

    Args:
        fromdate: Start of the date range, handed to ``wormdig``.
        todate: End of the date range, handed to ``wormdig``.
        **kwargs: Forwarded to ``wormdig``. ``target`` (or ``dir``) selects the
            output folder and defaults to ``'test'``. When ``pte`` (or
            ``author``) is given, posts not flagged with ``'pte'`` are skipped.

    Returns:
        None. Progress and file errors are reported with ``print``.
    """
    posts = wormdig(fromdate, todate, **kwargs)
    if not posts:
        print('Nothing saved')
        return None
    if not this.TEMPLATES:
        load_templates()  ## Load from CDN
    target = kwargs.get('target') or kwargs.get('dir') or 'test'
    author_filter = kwargs.get('pte') or kwargs.get('author')
    for banner in (
        '==================================',
        '============= SAVING =============',
        '==================================',
    ):
        print(banner)
    for post in posts:
        file = post['file']
        if author_filter and not post.get('pte'):
            print(f'No content for {file}')
            continue
        ## Style is looked up before the post body is rendered
        head = this.TEMPLATES['HTML_STYLE']
        body = render_template('POST_FORM', post)
        html = render_template('HTML_FORM', {'head': head, 'body': body})
        try:
            print(f'Saving {file}')
            save_file(html, f'{target}/{file}')
        except Exception as e:
            print(f'File error: {e}')
    print('Done')
34
+
35
def wormdig(fromdate: str, todate: str, **kwargs) -> list:
    """Select archived posts by date and page, optionally filtered by author.

    Args:
        fromdate: Start of the date range, handed to ``duckdig``.
        todate: End of the date range, handed to ``duckdig``.
        **kwargs: ``page`` (or ``pageno``) is the first page number to keep,
            default 1. ``pages`` is how many pages to keep from there, 0
            meaning no upper bound. ``wormhole`` names the post field holding
            the comment HTML, default ``'comments'``. ``pte`` (or ``author``)
            is a regular expression matched case-insensitively against each
            comment's author URL.

    Returns:
        The list of post dicts, or None (after printing a notice) when
        ``duckdig`` yields nothing. With an author filter, non-matching
        ``comdiv`` elements are removed from the comment HTML and posts that
        still contain a ``comdiv`` get ``post['pte'] = True``.
    """
    posts = duckdig(fromdate, todate)
    if not posts:
        print('No data available')
        return None
    page = int(kwargs.get('page') or kwargs.get('pageno') or 1)
    pages = int(kwargs.get('pages') or 0)  ## Total pages from [page]
    wormhole = kwargs.get('wormhole') or 'comments'  ## Can be customized on use
    pte = kwargs.get('pte') or kwargs.get('author')  ## Private comment as of Bloggerize JS module
    posts = [
        x for x in posts
        if (p := int(x['pageno'])) >= page and (p < page + pages or pages == 0)
    ]
    if not pte:
        return posts
    ## Compile once instead of rebuilding the pattern for every comment
    pattern = re.compile(pte, re.IGNORECASE)
    for post in posts:
        soup = BeautifulSoup(post[wormhole], 'html.parser')
        for com in soup.find_all('comdiv'):
            link = com.find('a', {'class': 'comment-authorurl'})
            ## A comment without an author link cannot match the filter
            authorurl = link.get('href') if link is not None else None
            if authorurl is None or not pattern.search(authorurl):
                com.decompose()
        if soup.find('comdiv'):
            post['pte'] = True
        post[wormhole] = str(soup)
    return posts
53
+
54
def duckdig(fromdate: str, todate: str) -> list:
    """Load archived records for a date range into DuckDB and return them.

    Args:
        fromdate: Start of the date range.
        todate: End of the date range.

    Returns:
        A list of row dicts from the ``temp`` table, or None when any step
        raised (the error is printed, not propagated).
    """
    conn = None
    try:
        conn = db_connect()
        df = db_load(conn, fromdate, todate)
        db_reform(conn, df)
        print('Duck dig done')
        return db_select(conn)
    except Exception as e:
        print(f'Error occurred: {e}')
        return None
    finally:
        ## Rows are fully fetched by db_select, so the connection can be released
        if conn is not None:
            conn.close()
63
+
64
def db_reform(conn, df: pd.DataFrame):
    """Rebuild the ``temp`` table from a flat series of record fields.

    The series is read in consecutive groups of ``len(this.COLUMNS)`` values,
    one group per table row; a trailing incomplete group is ignored.

    Args:
        conn: Database connection passed on to the ``db_*`` helpers.
        df: Flat field series as returned by ``db_load``, or None when
            nothing was loaded (an empty table is created in that case).
    """
    db_drop(conn)
    ## Create the table up front so it exists even when there are no records
    db_temp(conn)
    limit = len(this.COLUMNS)
    records = 0 if df is None else len(df) // limit
    for i in range(records):
        db_insert(conn, df, i)
    print('Temporary data table created')
70
+
71
def db_load(conn, fromdate: str, todate: str, parloc: str = None) -> pd.DataFrame:
    """Load the archive covering a date range and cut it down to that range.

    Records are stored as consecutive groups of ``len(this.COLUMNS)`` values.
    The first 10 characters of each group's first value are compared against
    the dates returned by ``arrange_dates``.

    Args:
        conn: Database connection used by ``db_parquet``.
        fromdate: Requested start date.
        todate: Requested end date.
        parloc: Optional location override passed to ``arrange_dates`` and
            ``db_parquet``.

    Returns:
        The re-indexed slice of field values for the matching records, or
        None when no archive or no record matches.
    """
    fromdate, todate, parquet = arrange_dates(fromdate, todate, parloc)
    if not parquet:
        return None
    df = db_parquet(conn, parquet, parloc)
    step = len(this.COLUMNS)
    first = None
    last = None
    for pos in range(0, len(df), step):
        dated = df[pos][:10]
        if dated < fromdate:
            continue
        if first is None:
            first = pos
        if dated > todate:
            break
        last = pos
    if first is None or last is None:
        return None
    return df[first:last + step].reset_index(drop=True)
88
+
89
def db_select(conn, query: str = 'SELECT * FROM temp', values: list = None) -> list:
    """Run a query and return its rows as dicts keyed by column name.

    Args:
        conn: Connection object providing ``cursor()``.
        query: SQL text to execute; defaults to selecting all of ``temp``.
        values: Positional parameters for the query; None means no parameters.

    Returns:
        One dict per result row, mapping column name to value.
    """
    cursor = conn.cursor()
    ## None replaces the former shared mutable default list
    cursor.execute(query, [] if values is None else values)
    rows = cursor.fetchall()
    columns = [desc[0] for desc in cursor.description]
    return [dict(zip(columns, row)) for row in rows]
95
+
96
def db_insert(conn, df: pd.DataFrame, pindex: int = 0):
    """Insert one record from the flat field series into ``temp``.

    Args:
        conn: Database connection.
        df: Flat series holding ``len(this.COLUMNS)`` values per record.
        pindex: Zero-based record number; its values start at
            ``pindex * len(this.COLUMNS)``.
    """
    db_temp(conn)
    width = len(this.COLUMNS)
    start = pindex * width
    placeholders = ','.join('?' * width)
    query = f'INSERT INTO temp ({",".join(this.COLUMNS)}) VALUES ({placeholders})'
    record = [df[pos] for pos in range(start, start + width)]
    conn.execute(query, record)
103
+
104
def db_drop(conn):
    """Drop the ``temp`` table if it exists.

    Args:
        conn: Database connection providing ``execute``.
    """
    conn.execute('DROP TABLE IF EXISTS temp')
107
+
108
def db_temp(conn):
    """Create the ``temp`` table if it does not exist yet.

    Every name in ``this.COLUMNS`` becomes a ``text`` column; the first one
    is additionally declared ``unique``.

    Args:
        conn: Database connection providing ``execute``.
    """
    definitions = ','.join(
        f'{col} text unique' if col == this.COLUMNS[0] else f'{col} text'
        for col in this.COLUMNS
    )
    conn.execute(f'CREATE TABLE IF NOT EXISTS temp ({definitions})')
117
+
118
def db_parquet(conn, parfile: str, parloc: str = None, tolist: bool = False, dfraw: bool = False):
    """Read a parquet archive through the database connection.

    Args:
        conn: Database connection whose ``execute`` understands
            ``read_parquet``.
        parfile: Archive file name, appended to ``this.STORAGE`` when
            ``parloc`` is not given.
        parloc: Full archive location that overrides the default URL.
        tolist: Return the raw rows from ``fetchall()``.
        dfraw: Return the whole frame from ``fetchdf()``.

    Returns:
        The rows, the full frame, or (by default) only the frame's first
        column.
    """
    print('Loading data archive..')
    parquet_url = parloc or this.STORAGE + parfile
    ## The name may come from a remote listing: double any quote so it cannot
    ## terminate the SQL string literal
    escaped_url = str(parquet_url).replace("'", "''")
    cursor = conn.execute(f"SELECT * FROM read_parquet('{escaped_url}')")
    print('Data archive loaded')
    if tolist:
        return cursor.fetchall()
    df = cursor.fetchdf()
    if dfraw:
        return df
    return df[df.columns[0]]
127
+
128
def db_connect(source: str = ':memory:') -> duckdb.DuckDBPyConnection:
    """Open a DuckDB connection with the ``httpfs`` extension installed and loaded.

    Args:
        source: Database to connect to; defaults to an in-memory database.

    Returns:
        The open connection.
    """
    conn = duckdb.connect(source)
    for statement in ('INSTALL httpfs', 'LOAD httpfs'):
        conn.execute(statement)
    return conn
133
+
134
def arrange_dates(fromdate: str, todate: str, parloc: str = None) -> tuple:
    """Normalise a date range and pick the archive file that covers it.

    Args:
        fromdate: Requested start date, normalised with ``parse_date``.
        todate: Requested end date, normalised with ``parse_date``.
        parloc: Optional listing location passed to ``date_ranges``.

    Returns:
        ``(fromdate, todate, parquet_name)``. The start is clamped to the
        chosen archive's start when it falls at or before it. When ``todate``
        lies beyond every archive, the last archive's full range is used.
        ``(None, None, None)`` is returned when the range is inverted or no
        archive listing is available.
    """
    fromdate = parse_date(fromdate)
    todate = parse_date(todate)
    if fromdate > todate:
        print('Invalid date range')
        return None, None, None
    ranges = date_ranges(parloc)
    if not ranges:
        ## An empty listing used to fail with IndexError on ranges[-1]
        print('No data archive available')
        return None, None, None
    for start, end in ranges:
        if todate <= end:
            if fromdate > start:
                return fromdate, todate, f'{end}.parquet'
            print('Invalid starting date, using default')
            return start, todate, f'{end}.parquet'
    print('Invalid ending date, using default')
    start, end = ranges[-1]
    return start, end, f'{end}.parquet'
150
+
151
def date_ranges(parloc: str = None) -> list:
    """Turn the archive listing into consecutive ``(start, end)`` pairs.

    The first pair starts at ``'2008-05-20'``; every later pair starts at the
    previous listing entry.

    Args:
        parloc: Optional listing location passed to ``list_parquets``.

    Returns:
        A list of ``(start, end)`` tuples, one per listing entry.
    """
    dates = list_parquets(parloc)
    starts = ['2008-05-20'] + dates[:-1]
    return list(zip(starts, dates))
159
+
160
def list_parquets(parloc: str = None) -> list:
    """Download the archive listing and return its non-blank lines.

    Args:
        parloc: Listing URL; defaults to ``this.PARQUETS``.

    Returns:
        The listing lines with surrounding whitespace removed, or an empty
        list when the request fails or returns an HTTP error status.
    """
    try:
        ## A timeout keeps a stalled server from hanging the caller forever
        response = requests.get(parloc or this.PARQUETS, timeout=30)
        ## Do not treat the body of an error page as a list of dates
        response.raise_for_status()
    except requests.RequestException:
        return []
    return [line.strip() for line in response.text.splitlines() if line.strip()]
166
+
167
def load_git_parquet(parfile: str, parloc: str = None) -> pd.DataFrame:
    """Load an archive from ``parloc`` or, by default, from ``this.STORAGE``.

    Args:
        parfile: Archive file name appended to ``this.STORAGE``.
        parloc: Full location that overrides the default URL.

    Returns:
        Whatever ``load_parquet`` returns for the resolved location.
    """
    location = parloc if parloc else this.STORAGE + parfile
    return load_parquet(location)
169
+
170
def dig_parquet(parquet: str, column: str, page: int = 1):
    """Load an archive and return one field of one record from it.

    Args:
        parquet: Path or URL handed to ``load_parquet``.
        column: Field name, one of ``this.COLUMNS``.
        page: One-based record number.

    Returns:
        The value found by ``dig_poo_data``.
    """
    return dig_poo_data(load_parquet(parquet), column, page)
173
+
174
def dig_poo_data(df: pd.DataFrame, column: str, page: int = 1):
    """Return one field of one record from a flat field series.

    Args:
        df: Flat series holding ``len(this.COLUMNS)`` values per record.
        column: Field name, one of ``this.COLUMNS``.
        page: One-based record number.

    Returns:
        The stored value, or None when the field name or position cannot be
        resolved.
    """
    try:
        position = len(this.COLUMNS) * (page - 1) + this.COLUMNS.index(column)
        return df[position]
    except Exception:
        ## Best-effort lookup, but let KeyboardInterrupt/SystemExit through
        return None
179
+
180
def load_parquet(parquet: str) -> pd.DataFrame:
    """Read a parquet file and return its first column.

    Args:
        parquet: Path or URL accepted by ``pd.read_parquet``.

    Returns:
        The first column of the loaded frame.
    """
    print('Loading remote parquet archive..')
    frame = pd.read_parquet(parquet)  ## Path/URL
    print('Parquet archive loaded')
    first_column = frame.columns[0]
    return frame[first_column]
185
+
186
def parquet_files(files: list, name: str = None, source: str = None, target: str = None):
    """Pack saved HTML pages into one gzip-compressed parquet archive.

    Args:
        files: Page file names to pack, in order.
        name: Archive and column name; defaults to the last file name up to
            its first dot.
        source: Folder holding the pages; defaults to the first entry of
            ``folder_source()``.
        target: Output folder; defaults to the first entry of
            ``folder_target()``.
    """
    if not name:
        name = files[-1].split('.')[0]
    if not source:
        source = folder_source()[0]
    if not target:
        target = folder_target()[0]
    frame = pack_files(files, source).to_frame(name=name)
    frame.to_parquet(f'{target}/{name}.parquet', index=False, compression='gzip')
    print('Done')
194
+
195
def pack_files(files: list, filedir: str) -> pd.DataFrame:
    """Pack several page files and concatenate the results in order.

    Args:
        files: Page file names to process.
        filedir: Folder holding the files.

    Returns:
        The ``pd.concat`` of every ``pack_file`` result, original index kept.
    """
    def _packed():
        for name in files:
            print('Process', name)
            yield pack_file(name, filedir)

    return pd.concat(list(_packed()))  ## ignore_index=False
201
+
202
def pack_file(filename: str, filedir: str) -> pd.DataFrame:
    """Return the ``data`` column of the frame built from ``load_file``'s result.

    Args:
        filename: Page file name.
        filedir: Folder holding the file.
    """
    loaded = load_file(filename, filedir)
    return pd.DataFrame(loaded)['data']
204
+
205
def save_file(content: str, output: str, usesoup: bool = True):
    """Write HTML content to a file, creating parent folders as needed.

    Args:
        content: HTML to write.
        output: Destination file path.
        usesoup: When true, the content is parsed and written as prettified
            UTF-8 bytes. When false, it is written unchanged (text is encoded
            as UTF-8).
    """
    if usesoup:
        soup = BeautifulSoup(content, 'html.parser')
        html = soup.prettify(encoding='utf-8')
    elif isinstance(content, str):
        ## The file is opened in binary mode, so text must be encoded first
        html = content.encode('utf-8')
    else:
        html = content
    output = Path(output)
    output.parent.mkdir(parents=True, exist_ok=True)
    with open(output, 'wb') as file:
        file.write(html)
    print('File saved')
215
+
216
def load_file(filename: str, filedir: str, full: bool = False) -> dict:
    """Parse a saved page and extract its record fields.

    Args:
        filename: Page file name; also stored as the record's first field.
        filedir: Folder holding the file.
        full: When true, skip extraction and return the whole page as HTML.

    Returns:
        A dict with keys ``'data'`` and ``'html'``. ``'data'`` maps each name
        in ``this.COLUMNS`` to the value read from the page body and is empty
        when the file is missing or ``full`` is set. ``'html'`` is only
        filled when ``full`` is set.
    """
    soup = read_file(filename, filedir)
    if not soup:
        return {'data': {}, 'html': None}
    if full:
        return {'data': {}, 'html': str(soup)}
    body = soup.find('body')

    def by_id(tag, ident):
        ## Find the first element with the given tag name and id in the body
        return body.find(tag, {'id': ident})

    record = {}
    record[this.COLUMNS[0]] = filename
    record[this.COLUMNS[1]] = by_id('a', 'bloggerEntryURL')['href']
    record[this.COLUMNS[2]] = by_id('h2', 'bloggerEntryTitle').get_text()
    record[this.COLUMNS[3]] = by_id('h3', 'bloggerEntryAuthor').get_text()
    record[this.COLUMNS[4]] = by_id('span', 'bloggerEntryLabel').get_text()
    record[this.COLUMNS[5]] = by_id('span', 'bloggerEntryGMTDate').get_text()
    ## Content and comments keep their markup, so the element itself is stored
    record[this.COLUMNS[6]] = str(by_id('span', 'bloggerEntryContent'))
    record[this.COLUMNS[7]] = by_id('span', 'bloggerEntryTotalComments').get_text()
    record[this.COLUMNS[8]] = by_id('span', 'bloggerTotalCommentPages').get_text()
    record[this.COLUMNS[9]] = by_id('span', 'bloggerCommentPageNo').get_text()
    record[this.COLUMNS[10]] = str(by_id('span', 'bloggerEntryComment'))
    return {'data': record, 'html': None}
234
+
235
def read_file(filename: str, filedir: str) -> BeautifulSoup:
    """Read a UTF-8 HTML file and parse it.

    Args:
        filename: File name inside ``filedir``.
        filedir: Folder holding the file.

    Returns:
        The parsed document, or None (after printing a notice) when the file
        does not exist.
    """
    try:
        with open(f'{filedir}/{filename}', 'r', encoding='utf-8') as handle:
            markup = handle.read()
    except FileNotFoundError:
        print('No file found')
        return None
    return BeautifulSoup(markup, 'html.parser')
243
+
244
def read_list(filelist: str = 'list.txt') -> list:
    """Read a text file and return its non-empty lines.

    Args:
        filelist: Path of the list file.

    Returns:
        The file's lines split on ``'\\n'`` with empty ones dropped, or an
        empty list (after printing a notice) when the file does not exist.
    """
    try:
        with open(filelist, 'r') as handle:
            entries = handle.read().split('\n')
    except FileNotFoundError:
        print('No list available')
        return []
    return [entry for entry in entries if entry]
253
+
254
def folder_target(path: str = 'parquet.target.txt') -> list:
    """Return the non-empty lines of the target-folder list file.

    Args:
        path: List file to read; defaults to ``'parquet.target.txt'``.
    """
    folders = read_list(path)
    return folders
256
+
257
def folder_source(path: str = 'parquet.source.txt') -> list:
    """Return the non-empty lines of the source-folder list file.

    Args:
        path: List file to read; defaults to ``'parquet.source.txt'``.
    """
    folders = read_list(path)
    return folders
259
+
260
def render_template(temp_var: str, data: dict = None) -> str:
    """Render one of the cached templates with the given values.

    Args:
        temp_var: Key of the template in ``this.TEMPLATES``.
        data: Values made available to the template; None means no values.

    Returns:
        The rendered text, or None when the template is not cached or its
        source is None (``load_templates`` stores whatever ``extract_var``
        returned, which can be None).
    """
    source = this.TEMPLATES.get(temp_var)
    if source is None:
        return None
    template = jinja2.Template(source)
    ## None replaces the former shared mutable default dict
    return template.render(**(data or {}))
264
+
265
def load_templates(cdn_url: str = None) -> dict:
    """Fetch the JS template source and cache the four templates from it.

    Args:
        cdn_url: Source URL passed to ``read_cdn``.

    Returns:
        ``this.TEMPLATES`` after its ``HTML_FORM``, ``POST_FORM``,
        ``COMMENT_FORM`` and ``HTML_STYLE`` entries were (re)assigned.
    """
    js_code = read_cdn(cdn_url)
    for name in ('HTML_FORM', 'POST_FORM', 'COMMENT_FORM', 'HTML_STYLE'):
        this.TEMPLATES[name] = extract_var(js_code, name)
    return this.TEMPLATES
272
+
273
def extract_cdn(text_var: str, cdn_url: str = None) -> str:
    """Fetch the JS source and extract one string variable from it.

    Args:
        text_var: Name of the variable to extract.
        cdn_url: Source URL passed to ``read_cdn``.

    Returns:
        Whatever ``extract_var`` returns for the fetched source.
    """
    js_code = read_cdn(cdn_url)
    return extract_var(js_code, text_var)
275
+
276
def extract_var(js_code: str, text_var: str) -> str:
    """Extract the string value assigned to a variable in JavaScript source.

    The assignment must look like ``NAME = `...``` or ``NAME = '...'`` with
    exactly one whitespace character on each side of ``=``. The value may
    span several lines.

    Args:
        js_code: JavaScript source text; None or empty yields None.
        text_var: Variable name, matched literally.

    Returns:
        The text between the delimiters, or None when nothing matches.
    """
    if not js_code:
        return None
    ## \1 makes the closing delimiter match the opening one, so an apostrophe
    ## inside a backtick string no longer cuts the value short
    pattern = re.escape(text_var) + r'\s=\s(`|\')(.*?)\1'
    res = re.search(pattern, js_code, re.DOTALL)
    if not res:
        return None
    return res[2]
281
+
282
def read_cdn(cdn_url: str = None) -> str:
    """Download the JavaScript template source as text.

    Args:
        cdn_url: Source URL; defaults to ``this.JSCDN``.

    Returns:
        The response body, or None when the request fails or returns an
        HTTP error status.
    """
    try:
        ## A timeout keeps a stalled server from hanging the caller forever
        response = requests.get(cdn_url or this.JSCDN, timeout=30)
        ## Do not hand an error page to the template extractor
        response.raise_for_status()
    except requests.RequestException:
        return None
    return response.text
288
+
289
+ COLUMNS = [ ## Field names of one page record, in the order load_file fills them; db_temp uses them as table columns
290
+ 'file',
291
+ 'entry',
292
+ 'title',
293
+ 'author',
294
+ 'label',
295
+ 'stamp',
296
+ 'content',
297
+ 'total',
298
+ 'pages',
299
+ 'pageno',
300
+ 'comments'
301
+ ]
302
+
303
+ PARQUETS = 'https://raw.githubusercontent.com/asinerum/poodigger/refs/heads/main/data/parquets.txt' ## Default listing URL fetched by list_parquets
304
+ STORAGE = 'https://github.com/asinerum/poodigger/raw/refs/heads/main/data/' ## Base URL prepended to a parquet file name
305
+ JSCDN = 'https://cdn.jsdelivr.net/gh/asinerum/bloggerize/src/htmls.js' ## Default JS template source read by read_cdn
306
+ TEMPLATES = {} ## Template cache filled by load_templates
307
+
308
+ this = sys.modules[__name__] ## This module object; settings are read as this.NAME and replaced via reconfig
309
+
310
def reconfig(**kwargs):
    """Set module-level attributes from keyword arguments.

    Args:
        **kwargs: Attribute names and the values to assign on this module,
            for example ``STORAGE`` or ``COLUMNS``.
    """
    for name in kwargs:
        setattr(this, name, kwargs[name])
@@ -0,0 +1,25 @@
1
+ Metadata-Version: 2.4
2
+ Name: poodigger
3
+ Version: 1.0.0
4
+ Summary: Blogger Comment Digging Toolkit
5
+ Home-page: https://github.com/asinerum/poodigger
6
+ Author: Asinerum Conlang Project
7
+ Author-email: asinerum.com@gmail.com
8
+ License: MIT
9
+ Classifier: Programming Language :: Python :: 3
10
+ Classifier: License :: OSI Approved :: MIT License
11
+ Classifier: Operating System :: OS Independent
12
+ Requires-Python: >=3.7
13
+ Description-Content-Type: text/markdown
14
+ License-File: LICENSE
15
+ Requires-Dist: bs4>=0.0.2
16
+ Requires-Dist: duckdb>=1.4.2
17
+ Requires-Dist: formatize>=1.0.3
18
+ Requires-Dist: jinja2>=3.1.6
19
+ Requires-Dist: pandas>=2.3.3
20
+ Dynamic: license-file
21
+
22
+ Detailed tips, tricks, and examples, can be found at project's repository
23
+ https://github.com/asinerum/poodigger
24
+
25
+ (C) 2026 Asinerum Conlang Project
@@ -0,0 +1,11 @@
1
+ LICENSE
2
+ README.md
3
+ pyproject.toml
4
+ setup.cfg
5
+ src/poodigger/__init__.py
6
+ src/poodigger/parquet.py
7
+ src/poodigger.egg-info/PKG-INFO
8
+ src/poodigger.egg-info/SOURCES.txt
9
+ src/poodigger.egg-info/dependency_links.txt
10
+ src/poodigger.egg-info/requires.txt
11
+ src/poodigger.egg-info/top_level.txt
@@ -0,0 +1,5 @@
1
+ bs4>=0.0.2
2
+ duckdb>=1.4.2
3
+ formatize>=1.0.3
4
+ jinja2>=3.1.6
5
+ pandas>=2.3.3
@@ -0,0 +1 @@
1
+ poodigger