imdb-sqlite 1.0.1__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
imdb_sqlite/__main__.py CHANGED
@@ -109,7 +109,8 @@ TSV_TABLE_MAP = OrderedDict([
109
109
  class Database:
110
110
  """ Shallow DB abstraction """
111
111
 
112
- def __init__(self, uri=':memory:'):
112
+ def __init__(self, table_map, uri=':memory:'):
113
+ self.table_map = table_map
113
114
  exists = os.path.exists(uri)
114
115
  self.connection = sqlite3.connect(uri, isolation_level=None)
115
116
  self.connection.executescript("""
@@ -128,14 +129,14 @@ class Database:
128
129
 
129
130
  def create_tables(self):
130
131
  sqls = [self._create_table_sql(table, mapping.values())
131
- for table, mapping in TSV_TABLE_MAP.values()]
132
+ for table, mapping in self.table_map.values()]
132
133
  sql = '\n'.join(sqls)
133
134
  logger.debug(sql)
134
135
  self.connection.executescript(sql)
135
136
 
136
137
  def create_indices(self):
137
138
  sqls = [self._create_index_sql(table, mapping.values())
138
- for table, mapping in TSV_TABLE_MAP.values()]
139
+ for table, mapping in self.table_map.values()]
139
140
  sql = '\n'.join([s for s in sqls if s])
140
141
  logger.debug(sql)
141
142
  for stmt in tqdm(sql.split('\n'), unit='index'):
@@ -286,17 +287,34 @@ def import_file(db, filename, table, column_mapping):
286
287
  raise
287
288
 
288
289
 
290
+ def filter_table_subset(table_map, wanted_tables):
291
+ def split_csv(s): return [v for v in (v.strip() for v in s.split(',')) if v]
292
+
293
+ wanted_tables = split_csv(wanted_tables)
294
+ out = OrderedDict()
295
+ for filename, (table_name, table_spec) in table_map.items():
296
+ if table_name in wanted_tables:
297
+ out[filename] = (table_name, table_spec)
298
+ return out
299
+
300
+
289
301
  def main():
290
302
  parser = argparse.ArgumentParser(
291
303
  formatter_class=argparse.ArgumentDefaultsHelpFormatter,
292
- description='Imports imdb tsv interface files into a new sqlite'
293
- 'database. Fetches them from imdb if not present on'
304
+ description='Imports imdb tsv interface files into a new sqlite '
305
+ 'database. Fetches them from imdb if not present on '
294
306
  'the machine.'
295
307
  )
296
308
  parser.add_argument('--db', metavar='FILE', default='imdb.db',
297
- help='Connection URI for the database to import into.')
309
+ help='Connection URI for the database to import into')
298
310
  parser.add_argument('--cache-dir', metavar='DIR', default='downloads',
299
- help='Download cache dir where the tsv files from imdb will be stored before the import.')
311
+ help='Download cache dir where the tsv files from imdb will be stored before the import')
312
+ parser.add_argument('--no-index', action='store_true',
313
+ help='Do not create any indices. Massively slower joins, but cuts the DB file size '
314
+ 'approximately in half')
315
+ parser.add_argument('--only', metavar='TABLES',
316
+ help='Import only some tables. The tables to import are specified using a comma delimited '
317
+ 'list, such as "people,titles". Use it to save storage space.')
300
318
  parser.add_argument('--verbose', action='store_true',
301
319
  help='Show database interaction')
302
320
  opts = parser.parse_args()
@@ -309,17 +327,20 @@ def main():
309
327
  logger.warning('DB already exists: ({db}). Refusing to modify. Exiting'.format(db=opts.db))
310
328
  return 1
311
329
 
312
- ensure_downloaded(TSV_TABLE_MAP.keys(), opts.cache_dir)
330
+ table_map = filter_table_subset(TSV_TABLE_MAP, opts.only) if opts.only else TSV_TABLE_MAP
331
+
332
+ ensure_downloaded(table_map.keys(), opts.cache_dir)
313
333
  logger.info('Populating database: {}'.format(opts.db))
314
- db = Database(uri=opts.db)
334
+ db = Database(table_map=table_map, uri=opts.db)
315
335
 
316
- for filename, table_mapping in TSV_TABLE_MAP.items():
336
+ for filename, table_mapping in table_map.items():
317
337
  table, column_mapping = table_mapping
318
338
  import_file(db, os.path.join(opts.cache_dir, filename),
319
339
  table, column_mapping)
320
340
 
321
- logger.info('Creating table indices ...')
322
- db.create_indices()
341
+ if not opts.no_index:
342
+ logger.info('Creating table indices ...')
343
+ db.create_indices()
323
344
 
324
345
  logger.info('Analyzing DB to generate statistic for query planner ...')
325
346
  db.analyze()
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.1
1
+ Metadata-Version: 2.4
2
2
  Name: imdb-sqlite
3
- Version: 1.0.1
3
+ Version: 1.2.0
4
4
  Summary: Imports IMDB TSV files into a SQLite database
5
5
  Home-page: https://github.com/jojje/imdb-sqlite
6
6
  Author: Jonas Tingeborn
@@ -12,7 +12,17 @@ Classifier: License :: OSI Approved :: GNU General Public License v2 (GPLv2)
12
12
  Classifier: Operating System :: OS Independent
13
13
  Description-Content-Type: text/markdown
14
14
  License-File: LICENSE
15
- Requires-Dist: tqdm (>=4.4.1)
15
+ Requires-Dist: tqdm>=4.4.1
16
+ Dynamic: author
17
+ Dynamic: author-email
18
+ Dynamic: classifier
19
+ Dynamic: description
20
+ Dynamic: description-content-type
21
+ Dynamic: home-page
22
+ Dynamic: license
23
+ Dynamic: license-file
24
+ Dynamic: requires-dist
25
+ Dynamic: summary
16
26
 
17
27
  # imdb-sqlite
18
28
  Imports IMDB TSV files into a SQLite database.
@@ -36,23 +46,28 @@ The program relies on the following IMDB tab separated files:
36
46
 
37
47
  usage: imdb-sqlite [OPTIONS]
38
48
 
39
- Imports imdb tsv interface files into a new sqlitedatabase. Fetches them from
40
- imdb if not present onthe machine.
49
+ Imports imdb tsv interface files into a new sqlite database. Fetches them from imdb
50
+ if not present on the machine.
41
51
 
42
52
  optional arguments:
43
53
  -h, --help show this help message and exit
44
- --db FILE Connection URI for the database to import into. (default:
45
- imdb.db)
46
- --cache-dir DIR Download cache dir where the tsv files from imdb will be
47
- stored before the import. (default: downloads)
48
- --verbose Show database interaction (default: False)
54
+ --db FILE Connection URI for the database to import into (default: imdb.db)
55
+ --cache-dir DIR Download cache dir where the tsv files from imdb will be stored
56
+ before the import (default: downloads)
57
+ --no-index Do not create any indices. Massively slower joins, but cuts the DB
58
+ file size approximately in half
59
+ --only TABLES Import only some tables. The tables to import are specified using
60
+ a comma delimited list, such as "people,titles". Use it to save
61
+ storage space.
62
+ --verbose Show database interaction
49
63
 
50
64
  Just run the program with no arguments, and you'll get a file named `imdb.db`
51
65
  in the current working directory.
52
66
 
53
67
  ### Hints
54
68
  * Make sure the disk the database is written to has sufficient space.
55
- About 5 GiB is needed.
69
+ About 19 GiB is needed as of early 2026. About 9.5 GiB without indices.
70
+ (for even less storage requirement, see Disk space tips below).
56
71
  * Use a SSD to speed up the import.
57
72
  * To check the best case import performance, use an in-memory database:
58
73
  `--db :memory:`.
@@ -60,50 +75,53 @@ in the current working directory.
60
75
  ## Example
61
76
 
62
77
  $ imdb-sqlite
78
+
79
+ 2026-02-04 16:30:31,311 Populating database: imdb.db
80
+ 2026-02-04 16:30:31,319 Applying schema
81
+
82
+ 2026-02-04 16:30:31,323 Importing file: downloads\name.basics.tsv.gz
83
+ 2026-02-04 16:30:31,324 Reading number of rows ...
84
+ 2026-02-04 16:30:34,373 Inserting rows into table: people
85
+ 100%|██████████████████| 15063390/15063390 [01:05<00:00, 228659.33 rows/s]
63
86
 
64
- 2018-07-08 16:00:00,000 Populating database: imdb.db
65
- 2018-07-08 16:00:00,001 Applying schema
66
-
67
- 2018-07-08 16:00:00,005 Importing file: downloads\name.basics.tsv.gz
68
- 2018-07-08 16:00:00,005 Reading number of rows ...
69
- 2018-07-08 16:00:11,521 Inserting rows into table: people
70
- 100%|█████████████████████████| 8699964/8699964 [01:23<00:00, 104387.75 rows/s]
71
-
72
- 2018-07-08 16:01:34,868 Importing file: downloads\title.basics.tsv.gz
73
- 2018-07-08 16:01:34,868 Reading number of rows ...
74
- 2018-07-08 16:01:41,873 Inserting rows into table: titles
75
- 100%|██████████████████████████| 5110779/5110779 [00:58<00:00, 87686.98 rows/s]
76
-
77
- 2018-07-08 16:02:40,161 Importing file: downloads\title.akas.tsv.gz
78
- 2018-07-08 16:02:40,161 Reading number of rows ...
79
- 2018-07-08 16:02:44,743 Inserting rows into table: akas
80
- 100%|██████████████████████████| 3625334/3625334 [00:37<00:00, 97412.94 rows/s]
81
-
82
- 2018-07-08 16:03:21,964 Importing file: downloads\title.principals.tsv.gz
83
- 2018-07-08 16:03:21,964 Reading number of rows ...
84
- 2018-07-08 16:03:55,922 Inserting rows into table: crew
85
- 100%|███████████████████████| 28914893/28914893 [03:45<00:00, 128037.21 rows/s]
86
-
87
- 2018-07-08 16:07:41,757 Importing file: downloads\title.episode.tsv.gz
88
- 2018-07-08 16:07:41,757 Reading number of rows ...
89
- 2018-07-08 16:07:45,370 Inserting rows into table: episodes
90
- 100%|█████████████████████████| 3449903/3449903 [00:21<00:00, 158265.16 rows/s]
91
-
92
- 2018-07-08 16:08:07,172 Importing file: downloads\title.ratings.tsv.gz
93
- 2018-07-08 16:08:07,172 Reading number of rows ...
94
- 2018-07-08 16:08:08,029 Inserting rows into table: ratings
95
- 100%|███████████████████████████| 846901/846901 [00:05<00:00, 152421.27 rows/s]
96
-
97
- 2018-07-08 16:08:13,589 Creating table indices ...
98
- 2018-07-08 16:09:16,451 Import successful
87
+ 2026-02-04 16:31:40,262 Importing file: downloads\title.basics.tsv.gz
88
+ 2026-02-04 16:31:40,262 Reading number of rows ...
89
+ 2026-02-04 16:31:42,777 Inserting rows into table: titles
90
+ 100%|██████████████████| 12265715/12265715 [01:06<00:00, 185564.42 rows/s]
91
+
92
+ 2026-02-04 16:32:48,879 Importing file: downloads\title.akas.tsv.gz
93
+ 2026-02-04 16:32:48,880 Reading number of rows ...
94
+ 2026-02-04 16:32:54,646 Inserting rows into table: akas
95
+ 100%|██████████████████| 54957563/54957563 [04:06<00:00, 222556.12 rows/s]
96
+
97
+ 2026-02-04 16:37:01,586 Importing file: downloads\title.principals.tsv.gz
98
+ 2026-02-04 16:37:01,587 Reading number of rows ...
99
+ 2026-02-04 16:37:11,294 Inserting rows into table: crew
100
+ 100%|██████████████████| 97617046/97617046 [06:27<00:00, 251790.20 rows/s]
101
+
102
+ 2026-02-04 16:43:38,990 Importing file: downloads\title.episode.tsv.gz
103
+ 2026-02-04 16:43:38,990 Reading number of rows ...
104
+ 2026-02-04 16:43:39,635 Inserting rows into table: episodes
105
+ 100%|████████████████████| 9462887/9462887 [00:29<00:00, 315650.53 rows/s]
106
+
107
+ 2026-02-04 16:44:09,618 Importing file: downloads\title.ratings.tsv.gz
108
+ 2026-02-04 16:44:09,618 Reading number of rows ...
109
+ 2026-02-04 16:44:09,706 Inserting rows into table: ratings
110
+ 100%|████████████████████| 1631810/1631810 [00:05<00:00, 304073.42 rows/s]
111
+
112
+ 2026-02-04 16:44:15,077 Creating table indices ...
113
+ 100%|██████████████████████████████████| 12/12 [03:19<00:00, 16.64s/index]
114
+
115
+ 2026-02-04 16:47:34,781 Analyzing DB to generate statistic for query planner ...
116
+ 2026-02-04 16:48:01,367 Import successful
99
117
 
100
118
 
101
119
  ### Note
102
120
  The import may take a long time, since there are millions of records to
103
121
  process.
104
122
 
105
- The above example used python 3.6.4 on windows 7, with the working directory
106
- being on a SSD.
123
+ The above example used python 3.10.13 on windows 10, with the working directory
124
+ being on a fast Kingston NVME SSD.
107
125
 
108
126
  ## Data model
109
127
 
@@ -117,7 +135,8 @@ reference it is in order.
117
135
 
118
136
  A movie has a title, a TV show has one. An episode has one as well. Well two
119
137
  actually; the title of the show, and the title of the episode itself. That is
120
- why there are two links to the same `title_id` attribute in the `titles` table.
138
+ why there are two links to the same `title_id` attribute in the `titles` table,
139
+ from the `episodes` table.
121
140
 
122
141
  To make the relationships a bit clearer, following are a few query examples
123
142
 
@@ -186,6 +205,61 @@ massive query speedup.
186
205
 
187
206
  For example `sqlite3 imdb.db "CREATE INDEX myindex ON <table-name> (<slow-column>)"`
188
207
 
208
+ ### Disk space tips
209
+ The imported data as of 2026 produces a database file that is about 19 GiB.
210
+ About half of that space is for indices used to speed up query lookups and
211
+ joins. The default indices take up about as much as the data.
212
+
213
+ To cater for use cases where people just want to use the tool as part of some
214
+ ETL-step, for refreshing the dataset every now and then, and then simply export
215
+ the full tables (e.g. for data science using pandas/ML), a `--no-index` flag is
216
+ available. When specifying this flag, no indices will be created, which not
217
+ only saves about 50% disk space, but also speeds up the overall import process.
218
+ When this flag is provided, the DB file will be just 9.5 GiB as of date of
219
+ writing.
220
+
221
+ If you know precisely which indices you need, omitting the default indices may
222
+ also be a good idea, since you'd then not waste disk space on indices you don't
223
+ need. Simply create the indices you _do_ need manually, as illustrated in the
224
+ performance tip above.
225
+
226
+ As an indicator, following is the current space consumption spread across the tables.
227
+
228
+ Full import
229
+
230
+ * default (includes indices): 19 GB
231
+ * without indices: 9.5 GB
232
+
233
+ Sizes of the respective tables when doing selective import of only a single
234
+ table without indices.
235
+
236
+ ```
237
+ * crew: 46% (4.4 GB)
238
+ * akas: 28% (2.7 GB)
239
+ * titles: 14% (1.3 GB)
240
+ * people: 8% (0.8 GB)
241
+ * episodes: 3% (0.3 GB)
242
+ * ratings: 1% (0.1 GB)
243
+ ```
244
+
245
+ Percentages are the relative space consumption of the full index-free import
246
+ (~9.5 GB).
247
+
248
+ Fair to say, "who played what character", or "fetched a doughnut to what
249
+ VIP-of-wasting-space" accounts for about half the storage. If you can live
250
+ without those details then there's a massive storage saving to be made. Also, if
251
+ you don't need all the aliases for all the titles, like the portuguese title of
252
+ some bollywood flick, then the akas can also be skipped. Getting rid of those
253
+ two tables shaves off 3/4 of the required space. That's significant.
254
+
255
+ If you don't care about characters, and just want to query movies or shows, their
256
+ ratings and perhaps per-episode ratings as well, then 2 GiB of storage suffices
257
+ as you only need tables titles, episodes and ratings. However if you actually
258
+ want to query those tables as well, then you'd want to create indices, either
259
+ manually or use the default. This ups the space requirement about 50% (3 GB).
260
+ I.e. just provide the command line argument `--only titles,ratings,episodes`.
261
+
262
+
189
263
  ## PyPI
190
264
  Current status of the project is:
191
265
  [![Build Status](https://github.com/jojje/imdb-sqlite/actions/workflows/python-publish.yml/badge.svg)](https://github.com/jojje/imdb-sqlite/actions/workflows/python-publish.yml)
@@ -0,0 +1,8 @@
1
+ imdb_sqlite/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
+ imdb_sqlite/__main__.py,sha256=w_xmBuRxj6-Cg31ITwnkW5QYWaco3E-3HYoiDZ_AZYk,13121
3
+ imdb_sqlite-1.2.0.dist-info/licenses/LICENSE,sha256=gXf5dRMhNSbfLPYYTY_5hsZ1r7UU1OaKQEAQUhuIBkM,18092
4
+ imdb_sqlite-1.2.0.dist-info/METADATA,sha256=J3N9Z7f7K_LYNcDLAPogrMOGiKkXO6Z222QWi15v1EM,12011
5
+ imdb_sqlite-1.2.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
6
+ imdb_sqlite-1.2.0.dist-info/entry_points.txt,sha256=0qU1qhre6z6V8Q_q0QxnKvmx2c8BVjhPGqIg9-Inpcc,58
7
+ imdb_sqlite-1.2.0.dist-info/top_level.txt,sha256=mPPacZxoJziQ2beMOavSJ9Yyg_dhGmOkNN5gP_boDSY,12
8
+ imdb_sqlite-1.2.0.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: bdist_wheel (0.40.0)
2
+ Generator: setuptools (80.10.2)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
@@ -1,8 +0,0 @@
1
- imdb_sqlite/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
- imdb_sqlite/__main__.py,sha256=kh5txitlqOH1WJR9WjjcRILeV1LMV2SltoatU8eHB0g,12069
3
- imdb_sqlite-1.0.1.dist-info/LICENSE,sha256=gXf5dRMhNSbfLPYYTY_5hsZ1r7UU1OaKQEAQUhuIBkM,18092
4
- imdb_sqlite-1.0.1.dist-info/METADATA,sha256=aNYUTd9DHoSkzaWscP6Gqslm1vcKVqj9ON_UkE1pZ2U,8811
5
- imdb_sqlite-1.0.1.dist-info/WHEEL,sha256=pkctZYzUS4AYVn6dJ-7367OJZivF2e8RA9b_ZBjif18,92
6
- imdb_sqlite-1.0.1.dist-info/entry_points.txt,sha256=0qU1qhre6z6V8Q_q0QxnKvmx2c8BVjhPGqIg9-Inpcc,58
7
- imdb_sqlite-1.0.1.dist-info/top_level.txt,sha256=mPPacZxoJziQ2beMOavSJ9Yyg_dhGmOkNN5gP_boDSY,12
8
- imdb_sqlite-1.0.1.dist-info/RECORD,,