imdb-sqlite 1.1.0__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- imdb_sqlite/__main__.py +25 -8
- {imdb_sqlite-1.1.0.dist-info → imdb_sqlite-1.2.0.dist-info}/METADATA +103 -48
- imdb_sqlite-1.2.0.dist-info/RECORD +8 -0
- {imdb_sqlite-1.1.0.dist-info → imdb_sqlite-1.2.0.dist-info}/WHEEL +1 -1
- imdb_sqlite-1.1.0.dist-info/RECORD +0 -8
- {imdb_sqlite-1.1.0.dist-info → imdb_sqlite-1.2.0.dist-info}/entry_points.txt +0 -0
- {imdb_sqlite-1.1.0.dist-info → imdb_sqlite-1.2.0.dist-info/licenses}/LICENSE +0 -0
- {imdb_sqlite-1.1.0.dist-info → imdb_sqlite-1.2.0.dist-info}/top_level.txt +0 -0
imdb_sqlite/__main__.py
CHANGED
|
@@ -109,7 +109,8 @@ TSV_TABLE_MAP = OrderedDict([
|
|
|
109
109
|
class Database:
|
|
110
110
|
""" Shallow DB abstraction """
|
|
111
111
|
|
|
112
|
-
def __init__(self, uri=':memory:'):
|
|
112
|
+
def __init__(self, table_map, uri=':memory:'):
|
|
113
|
+
self.table_map = table_map
|
|
113
114
|
exists = os.path.exists(uri)
|
|
114
115
|
self.connection = sqlite3.connect(uri, isolation_level=None)
|
|
115
116
|
self.connection.executescript("""
|
|
@@ -128,14 +129,14 @@ class Database:
|
|
|
128
129
|
|
|
129
130
|
def create_tables(self):
|
|
130
131
|
sqls = [self._create_table_sql(table, mapping.values())
|
|
131
|
-
for table, mapping in
|
|
132
|
+
for table, mapping in self.table_map.values()]
|
|
132
133
|
sql = '\n'.join(sqls)
|
|
133
134
|
logger.debug(sql)
|
|
134
135
|
self.connection.executescript(sql)
|
|
135
136
|
|
|
136
137
|
def create_indices(self):
|
|
137
138
|
sqls = [self._create_index_sql(table, mapping.values())
|
|
138
|
-
for table, mapping in
|
|
139
|
+
for table, mapping in self.table_map.values()]
|
|
139
140
|
sql = '\n'.join([s for s in sqls if s])
|
|
140
141
|
logger.debug(sql)
|
|
141
142
|
for stmt in tqdm(sql.split('\n'), unit='index'):
|
|
@@ -286,11 +287,22 @@ def import_file(db, filename, table, column_mapping):
|
|
|
286
287
|
raise
|
|
287
288
|
|
|
288
289
|
|
|
290
|
+
def filter_table_subset(table_map, wanted_tables):
    """ Return the subset of table_map restricted to the wanted tables.

    table_map maps a filename to a (table_name, table_spec) pair.
    wanted_tables is a comma delimited string of table names; whitespace
    around each name is stripped and empty items are dropped.

    The result is a new OrderedDict holding only the entries whose table
    name was requested, in the same order as they appear in table_map.
    Requested names that match no entry in table_map are silently ignored.
    """
    # Normalise the comma separated user input into a set of clean names.
    requested = set()
    for token in wanted_tables.split(','):
        name = token.strip()
        if name:
            requested.add(name)

    # Walk table_map (not the request) so the original ordering is preserved.
    return OrderedDict(
        (tsv_file, (table_name, table_spec))
        for tsv_file, (table_name, table_spec) in table_map.items()
        if table_name in requested
    )
|
|
299
|
+
|
|
300
|
+
|
|
289
301
|
def main():
|
|
290
302
|
parser = argparse.ArgumentParser(
|
|
291
303
|
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
|
|
292
|
-
description='Imports imdb tsv interface files into a new sqlite'
|
|
293
|
-
'database. Fetches them from imdb if not present on'
|
|
304
|
+
description='Imports imdb tsv interface files into a new sqlite '
|
|
305
|
+
'database. Fetches them from imdb if not present on '
|
|
294
306
|
'the machine.'
|
|
295
307
|
)
|
|
296
308
|
parser.add_argument('--db', metavar='FILE', default='imdb.db',
|
|
@@ -300,6 +312,9 @@ def main():
|
|
|
300
312
|
parser.add_argument('--no-index', action='store_true',
|
|
301
313
|
help='Do not create any indices. Massively slower joins, but cuts the DB file size '
|
|
302
314
|
'approximately in half')
|
|
315
|
+
parser.add_argument('--only', metavar='TABLES',
|
|
316
|
+
help='Import only a some tables. The tables to import are specified using a comma delimited '
|
|
317
|
+
'list, such as "people,titles". Use it to save storage space.')
|
|
303
318
|
parser.add_argument('--verbose', action='store_true',
|
|
304
319
|
help='Show database interaction')
|
|
305
320
|
opts = parser.parse_args()
|
|
@@ -312,11 +327,13 @@ def main():
|
|
|
312
327
|
logger.warning('DB already exists: ({db}). Refusing to modify. Exiting'.format(db=opts.db))
|
|
313
328
|
return 1
|
|
314
329
|
|
|
315
|
-
|
|
330
|
+
table_map = filter_table_subset(TSV_TABLE_MAP, opts.only) if opts.only else TSV_TABLE_MAP
|
|
331
|
+
|
|
332
|
+
ensure_downloaded(table_map.keys(), opts.cache_dir)
|
|
316
333
|
logger.info('Populating database: {}'.format(opts.db))
|
|
317
|
-
db = Database(uri=opts.db)
|
|
334
|
+
db = Database(table_map=table_map, uri=opts.db)
|
|
318
335
|
|
|
319
|
-
for filename, table_mapping in
|
|
336
|
+
for filename, table_mapping in table_map.items():
|
|
320
337
|
table, column_mapping = table_mapping
|
|
321
338
|
import_file(db, os.path.join(opts.cache_dir, filename),
|
|
322
339
|
table, column_mapping)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
Metadata-Version: 2.
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
2
|
Name: imdb-sqlite
|
|
3
|
-
Version: 1.
|
|
3
|
+
Version: 1.2.0
|
|
4
4
|
Summary: Imports IMDB TSV files into a SQLite database
|
|
5
5
|
Home-page: https://github.com/jojje/imdb-sqlite
|
|
6
6
|
Author: Jonas Tingeborn
|
|
@@ -12,7 +12,17 @@ Classifier: License :: OSI Approved :: GNU General Public License v2 (GPLv2)
|
|
|
12
12
|
Classifier: Operating System :: OS Independent
|
|
13
13
|
Description-Content-Type: text/markdown
|
|
14
14
|
License-File: LICENSE
|
|
15
|
-
Requires-Dist: tqdm
|
|
15
|
+
Requires-Dist: tqdm>=4.4.1
|
|
16
|
+
Dynamic: author
|
|
17
|
+
Dynamic: author-email
|
|
18
|
+
Dynamic: classifier
|
|
19
|
+
Dynamic: description
|
|
20
|
+
Dynamic: description-content-type
|
|
21
|
+
Dynamic: home-page
|
|
22
|
+
Dynamic: license
|
|
23
|
+
Dynamic: license-file
|
|
24
|
+
Dynamic: requires-dist
|
|
25
|
+
Dynamic: summary
|
|
16
26
|
|
|
17
27
|
# imdb-sqlite
|
|
18
28
|
Imports IMDB TSV files into a SQLite database.
|
|
@@ -36,8 +46,8 @@ The program relies on the following IMDB tab separated files:
|
|
|
36
46
|
|
|
37
47
|
usage: imdb-sqlite [OPTIONS]
|
|
38
48
|
|
|
39
|
-
Imports imdb tsv interface files into a new
|
|
40
|
-
|
|
49
|
+
Imports imdb tsv interface files into a new sqlite database. Fetches them from imdb
|
|
50
|
+
if not present on the machine.
|
|
41
51
|
|
|
42
52
|
optional arguments:
|
|
43
53
|
-h, --help show this help message and exit
|
|
@@ -45,15 +55,19 @@ The program relies on the following IMDB tab separated files:
|
|
|
45
55
|
--cache-dir DIR Download cache dir where the tsv files from imdb will be stored
|
|
46
56
|
before the import (default: downloads)
|
|
47
57
|
--no-index Do not create any indices. Massively slower joins, but cuts the DB
|
|
48
|
-
file size approximately in half
|
|
49
|
-
--
|
|
58
|
+
file size approximately in half
|
|
59
|
+
--only TABLES Import only some tables. The tables to import are specified using
|
|
60
|
+
a comma delimited list, such as "people,titles". Use it to save
|
|
61
|
+
storage space.
|
|
62
|
+
--verbose Show database interaction
|
|
50
63
|
|
|
51
64
|
Just run the program with no arguments, and you'll get a file named `imdb.db`
|
|
52
65
|
in the current working directory.
|
|
53
66
|
|
|
54
67
|
### Hints
|
|
55
68
|
* Make sure the disk the database is written to has sufficient space.
|
|
56
|
-
About
|
|
69
|
+
About 19 GiB is needed as of early 2026. About 9.5 GB without indices.
|
|
70
|
+
(for even less storage requirement, see Disk space tips below).
|
|
57
71
|
* Use a SSD to speed up the import.
|
|
58
72
|
* To check the best case import performance, use an in-memory database:
|
|
59
73
|
`--db :memory:`.
|
|
@@ -61,50 +75,53 @@ in the current working directory.
|
|
|
61
75
|
## Example
|
|
62
76
|
|
|
63
77
|
$ imdb-sqlite
|
|
78
|
+
|
|
79
|
+
2026-02-04 16:30:31,311 Populating database: imdb.db
|
|
80
|
+
2026-02-04 16:30:31,319 Applying schema
|
|
81
|
+
|
|
82
|
+
2026-02-04 16:30:31,323 Importing file: downloads\name.basics.tsv.gz
|
|
83
|
+
2026-02-04 16:30:31,324 Reading number of rows ...
|
|
84
|
+
2026-02-04 16:30:34,373 Inserting rows into table: people
|
|
85
|
+
100%|██████████████████| 15063390/15063390 [01:05<00:00, 228659.33 rows/s]
|
|
64
86
|
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
100
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
2018-07-08 16:08:08,029 Inserting rows into table: ratings
|
|
96
|
-
100%|███████████████████████████| 846901/846901 [00:05<00:00, 152421.27 rows/s]
|
|
97
|
-
|
|
98
|
-
2018-07-08 16:08:13,589 Creating table indices ...
|
|
99
|
-
2018-07-08 16:09:16,451 Import successful
|
|
87
|
+
2026-02-04 16:31:40,262 Importing file: downloads\title.basics.tsv.gz
|
|
88
|
+
2026-02-04 16:31:40,262 Reading number of rows ...
|
|
89
|
+
2026-02-04 16:31:42,777 Inserting rows into table: titles
|
|
90
|
+
100%|██████████████████| 12265715/12265715 [01:06<00:00, 185564.42 rows/s]
|
|
91
|
+
|
|
92
|
+
2026-02-04 16:32:48,879 Importing file: downloads\title.akas.tsv.gz
|
|
93
|
+
2026-02-04 16:32:48,880 Reading number of rows ...
|
|
94
|
+
2026-02-04 16:32:54,646 Inserting rows into table: akas
|
|
95
|
+
100%|██████████████████| 54957563/54957563 [04:06<00:00, 222556.12 rows/s]
|
|
96
|
+
|
|
97
|
+
2026-02-04 16:37:01,586 Importing file: downloads\title.principals.tsv.gz
|
|
98
|
+
2026-02-04 16:37:01,587 Reading number of rows ...
|
|
99
|
+
2026-02-04 16:37:11,294 Inserting rows into table: crew
|
|
100
|
+
100%|██████████████████| 97617046/97617046 [06:27<00:00, 251790.20 rows/s]
|
|
101
|
+
|
|
102
|
+
2026-02-04 16:43:38,990 Importing file: downloads\title.episode.tsv.gz
|
|
103
|
+
2026-02-04 16:43:38,990 Reading number of rows ...
|
|
104
|
+
2026-02-04 16:43:39,635 Inserting rows into table: episodes
|
|
105
|
+
100%|████████████████████| 9462887/9462887 [00:29<00:00, 315650.53 rows/s]
|
|
106
|
+
|
|
107
|
+
2026-02-04 16:44:09,618 Importing file: downloads\title.ratings.tsv.gz
|
|
108
|
+
2026-02-04 16:44:09,618 Reading number of rows ...
|
|
109
|
+
2026-02-04 16:44:09,706 Inserting rows into table: ratings
|
|
110
|
+
100%|████████████████████| 1631810/1631810 [00:05<00:00, 304073.42 rows/s]
|
|
111
|
+
|
|
112
|
+
2026-02-04 16:44:15,077 Creating table indices ...
|
|
113
|
+
100%|██████████████████████████████████| 12/12 [03:19<00:00, 16.64s/index]
|
|
114
|
+
|
|
115
|
+
2026-02-04 16:47:34,781 Analyzing DB to generate statistic for query planner ...
|
|
116
|
+
2026-02-04 16:48:01,367 Import successful
|
|
100
117
|
|
|
101
118
|
|
|
102
119
|
### Note
|
|
103
120
|
The import may take a long time, since there are millions of records to
|
|
104
121
|
process.
|
|
105
122
|
|
|
106
|
-
The above example used python 3.
|
|
107
|
-
being on a SSD.
|
|
123
|
+
The above example used Python 3.10.13 on Windows 10, with the working directory
|
|
124
|
+
being on a fast Kingston NVMe SSD.
|
|
108
125
|
|
|
109
126
|
## Data model
|
|
110
127
|
|
|
@@ -118,7 +135,8 @@ reference it is in order.
|
|
|
118
135
|
|
|
119
136
|
A movie has a title, a TV show has one. An episode has one as well. Well two
|
|
120
137
|
actually; the title of the show, and the title of the episode itself. That is
|
|
121
|
-
why there are two links to the same `title_id` attribute in the `titles` table
|
|
138
|
+
why there are two links to the same `title_id` attribute in the `titles` table,
|
|
139
|
+
from the `episodes` table.
|
|
122
140
|
|
|
123
141
|
To make the relationships a bit clearer, following are a few query examples
|
|
124
142
|
|
|
@@ -188,7 +206,7 @@ massive query speedup.
|
|
|
188
206
|
For example `sqlite3 imdb.db "CREATE INDEX myindex ON <table-name> (<slow-column>)"`
|
|
189
207
|
|
|
190
208
|
### Disk space tips
|
|
191
|
-
The imported data as of
|
|
209
|
+
The imported data as of 2026 produces a database file that is about 19 GiB.
|
|
192
210
|
About half of that space is for indices used to speed up query lookups and
|
|
193
211
|
joins. The default indices take up about as much as the data.
|
|
194
212
|
|
|
@@ -197,7 +215,7 @@ ETL-step, for refreshing the dataset every now and then, and then simply export
|
|
|
197
215
|
the full tables (e.g. for data science using pandas/ML), a `--no-index` flag is
|
|
198
216
|
available. When specifying this flag, no indices will be created, which not
|
|
199
217
|
only saves about 50% disk space, but also speeds up the overall import process.
|
|
200
|
-
When this flag is provided, the DB file will be just
|
|
218
|
+
When this flag is provided, the DB file will be just 9.5 GiB as of date of
|
|
201
219
|
writing.
|
|
202
220
|
|
|
203
221
|
If you know precisely which indices you need, omitting the default indices may
|
|
@@ -205,6 +223,43 @@ also be a good idea, since you'd then not waste disk space on indices you don't
|
|
|
205
223
|
need. Simply create the indices you _do_ need manually, as illustrated in the
|
|
206
224
|
performance tip above.
|
|
207
225
|
|
|
226
|
+
As an indicator, the following is the current space consumption spread across the tables.
|
|
227
|
+
|
|
228
|
+
Full import
|
|
229
|
+
|
|
230
|
+
* default (includes indices): 19 GB
|
|
231
|
+
* without indices: 9.5 GB
|
|
232
|
+
|
|
233
|
+
Sizes of the respective tables when doing selective import of only a single
|
|
234
|
+
table without indices.
|
|
235
|
+
|
|
236
|
+
```
|
|
237
|
+
* crew: 46% (4.4 GB)
|
|
238
|
+
* akas: 28% (2.7 GB)
|
|
239
|
+
* titles: 14% (1.3 GB)
|
|
240
|
+
* people: 8% (0.8 GB)
|
|
241
|
+
* episodes: 3% (0.3 GB)
|
|
242
|
+
* ratings: 1% (0.1 GB)
|
|
243
|
+
```
|
|
244
|
+
|
|
245
|
+
Percentages are the relative space consumption of the full index-free import
|
|
246
|
+
(~9.5 GB).
|
|
247
|
+
|
|
248
|
+
Fair to say, "who played what character", or "fetched a doughnut to what
|
|
249
|
+
VIP-of-wasting-space" accounts for about half the storage. If you can live
|
|
250
|
+
without those details then there's a massive storage saving to be made. Also, if
|
|
251
|
+
you don't need all the aliases for all the titles, like the Portuguese title of
|
|
252
|
+
some Bollywood flick, then the akas can also be skipped. Getting rid of those
|
|
253
|
+
two tables shaves off 3/4 of the required space. That's significant.
|
|
254
|
+
|
|
255
|
+
If you don't care about characters, and just want to query movies or shows, their
|
|
256
|
+
ratings and perhaps per-episode ratings as well, then 2 GiB of storage suffices
|
|
257
|
+
as you only need tables titles, episodes and ratings. However, if you actually
|
|
258
|
+
want to query those tables as well, then you'd want to create indices, either
|
|
259
|
+
manually or use the default. This ups the space requirement by about 50% (3 GB).
|
|
260
|
+
I.e. just provide the command line argument `--only titles,ratings,episodes`.
|
|
261
|
+
|
|
262
|
+
|
|
208
263
|
## PyPI
|
|
209
264
|
Current status of the project is:
|
|
210
265
|
[](https://github.com/jojje/imdb-sqlite/actions/workflows/python-publish.yml)
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
imdb_sqlite/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
|
+
imdb_sqlite/__main__.py,sha256=w_xmBuRxj6-Cg31ITwnkW5QYWaco3E-3HYoiDZ_AZYk,13121
|
|
3
|
+
imdb_sqlite-1.2.0.dist-info/licenses/LICENSE,sha256=gXf5dRMhNSbfLPYYTY_5hsZ1r7UU1OaKQEAQUhuIBkM,18092
|
|
4
|
+
imdb_sqlite-1.2.0.dist-info/METADATA,sha256=J3N9Z7f7K_LYNcDLAPogrMOGiKkXO6Z222QWi15v1EM,12011
|
|
5
|
+
imdb_sqlite-1.2.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
|
|
6
|
+
imdb_sqlite-1.2.0.dist-info/entry_points.txt,sha256=0qU1qhre6z6V8Q_q0QxnKvmx2c8BVjhPGqIg9-Inpcc,58
|
|
7
|
+
imdb_sqlite-1.2.0.dist-info/top_level.txt,sha256=mPPacZxoJziQ2beMOavSJ9Yyg_dhGmOkNN5gP_boDSY,12
|
|
8
|
+
imdb_sqlite-1.2.0.dist-info/RECORD,,
|
|
@@ -1,8 +0,0 @@
|
|
|
1
|
-
imdb_sqlite/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
|
-
imdb_sqlite/__main__.py,sha256=U7XxeHdV1up2X2VOGKPYPFqAFL-majAglYVWyarsnas,12323
|
|
3
|
-
imdb_sqlite-1.1.0.dist-info/LICENSE,sha256=gXf5dRMhNSbfLPYYTY_5hsZ1r7UU1OaKQEAQUhuIBkM,18092
|
|
4
|
-
imdb_sqlite-1.1.0.dist-info/METADATA,sha256=R6P2TzoZLZSneH2Xe6faC6lLsrcdirmfagP1tyCoAwU,9927
|
|
5
|
-
imdb_sqlite-1.1.0.dist-info/WHEEL,sha256=pkctZYzUS4AYVn6dJ-7367OJZivF2e8RA9b_ZBjif18,92
|
|
6
|
-
imdb_sqlite-1.1.0.dist-info/entry_points.txt,sha256=0qU1qhre6z6V8Q_q0QxnKvmx2c8BVjhPGqIg9-Inpcc,58
|
|
7
|
-
imdb_sqlite-1.1.0.dist-info/top_level.txt,sha256=mPPacZxoJziQ2beMOavSJ9Yyg_dhGmOkNN5gP_boDSY,12
|
|
8
|
-
imdb_sqlite-1.1.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|