imdb-sqlite 1.1.0__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
imdb_sqlite/__main__.py CHANGED
@@ -109,7 +109,8 @@ TSV_TABLE_MAP = OrderedDict([
109
109
  class Database:
110
110
  """ Shallow DB abstraction """
111
111
 
112
- def __init__(self, uri=':memory:'):
112
+ def __init__(self, table_map, uri=':memory:'):
113
+ self.table_map = table_map
113
114
  exists = os.path.exists(uri)
114
115
  self.connection = sqlite3.connect(uri, isolation_level=None)
115
116
  self.connection.executescript("""
@@ -128,14 +129,14 @@ class Database:
128
129
 
129
130
  def create_tables(self):
130
131
  sqls = [self._create_table_sql(table, mapping.values())
131
- for table, mapping in TSV_TABLE_MAP.values()]
132
+ for table, mapping in self.table_map.values()]
132
133
  sql = '\n'.join(sqls)
133
134
  logger.debug(sql)
134
135
  self.connection.executescript(sql)
135
136
 
136
137
  def create_indices(self):
137
138
  sqls = [self._create_index_sql(table, mapping.values())
138
- for table, mapping in TSV_TABLE_MAP.values()]
139
+ for table, mapping in self.table_map.values()]
139
140
  sql = '\n'.join([s for s in sqls if s])
140
141
  logger.debug(sql)
141
142
  for stmt in tqdm(sql.split('\n'), unit='index'):
@@ -286,11 +287,22 @@ def import_file(db, filename, table, column_mapping):
286
287
  raise
287
288
 
288
289
 
290
+ def filter_table_subset(table_map, wanted_tables):
291
+ def split_csv(s): return [v for v in (v.strip() for v in s.split(',')) if v]
292
+
293
+ wanted_tables = split_csv(wanted_tables)
294
+ out = OrderedDict()
295
+ for filename, (table_name, table_spec) in table_map.items():
296
+ if table_name in wanted_tables:
297
+ out[filename] = (table_name, table_spec)
298
+ return out
299
+
300
+
289
301
  def main():
290
302
  parser = argparse.ArgumentParser(
291
303
  formatter_class=argparse.ArgumentDefaultsHelpFormatter,
292
- description='Imports imdb tsv interface files into a new sqlite'
293
- 'database. Fetches them from imdb if not present on'
304
+ description='Imports imdb tsv interface files into a new sqlite '
305
+ 'database. Fetches them from imdb if not present on '
294
306
  'the machine.'
295
307
  )
296
308
  parser.add_argument('--db', metavar='FILE', default='imdb.db',
@@ -300,6 +312,9 @@ def main():
300
312
  parser.add_argument('--no-index', action='store_true',
301
313
  help='Do not create any indices. Massively slower joins, but cuts the DB file size '
302
314
  'approximately in half')
315
+ parser.add_argument('--only', metavar='TABLES',
316
+ help='Import only some tables. The tables to import are specified using a comma delimited '
317
+ 'list, such as "people,titles". Use it to save storage space.')
303
318
  parser.add_argument('--verbose', action='store_true',
304
319
  help='Show database interaction')
305
320
  opts = parser.parse_args()
@@ -312,11 +327,13 @@ def main():
312
327
  logger.warning('DB already exists: ({db}). Refusing to modify. Exiting'.format(db=opts.db))
313
328
  return 1
314
329
 
315
- ensure_downloaded(TSV_TABLE_MAP.keys(), opts.cache_dir)
330
+ table_map = filter_table_subset(TSV_TABLE_MAP, opts.only) if opts.only else TSV_TABLE_MAP
331
+
332
+ ensure_downloaded(table_map.keys(), opts.cache_dir)
316
333
  logger.info('Populating database: {}'.format(opts.db))
317
- db = Database(uri=opts.db)
334
+ db = Database(table_map=table_map, uri=opts.db)
318
335
 
319
- for filename, table_mapping in TSV_TABLE_MAP.items():
336
+ for filename, table_mapping in table_map.items():
320
337
  table, column_mapping = table_mapping
321
338
  import_file(db, os.path.join(opts.cache_dir, filename),
322
339
  table, column_mapping)
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.1
1
+ Metadata-Version: 2.4
2
2
  Name: imdb-sqlite
3
- Version: 1.1.0
3
+ Version: 1.2.0
4
4
  Summary: Imports IMDB TSV files into a SQLite database
5
5
  Home-page: https://github.com/jojje/imdb-sqlite
6
6
  Author: Jonas Tingeborn
@@ -12,7 +12,17 @@ Classifier: License :: OSI Approved :: GNU General Public License v2 (GPLv2)
12
12
  Classifier: Operating System :: OS Independent
13
13
  Description-Content-Type: text/markdown
14
14
  License-File: LICENSE
15
- Requires-Dist: tqdm (>=4.4.1)
15
+ Requires-Dist: tqdm>=4.4.1
16
+ Dynamic: author
17
+ Dynamic: author-email
18
+ Dynamic: classifier
19
+ Dynamic: description
20
+ Dynamic: description-content-type
21
+ Dynamic: home-page
22
+ Dynamic: license
23
+ Dynamic: license-file
24
+ Dynamic: requires-dist
25
+ Dynamic: summary
16
26
 
17
27
  # imdb-sqlite
18
28
  Imports IMDB TSV files into a SQLite database.
@@ -36,8 +46,8 @@ The program relies on the following IMDB tab separated files:
36
46
 
37
47
  usage: imdb-sqlite [OPTIONS]
38
48
 
39
- Imports imdb tsv interface files into a new sqlitedatabase. Fetches them from
40
- imdb if not present onthe machine.
49
+ Imports imdb tsv interface files into a new sqlite database. Fetches them from imdb
50
+ if not present on the machine.
41
51
 
42
52
  optional arguments:
43
53
  -h, --help show this help message and exit
@@ -45,15 +55,19 @@ The program relies on the following IMDB tab separated files:
45
55
  --cache-dir DIR Download cache dir where the tsv files from imdb will be stored
46
56
  before the import (default: downloads)
47
57
  --no-index Do not create any indices. Massively slower joins, but cuts the DB
48
- file size approximately in half (default: False)
49
- --verbose Show database interaction (default: False)
58
+ file size approximately in half
59
+ --only TABLES Import only some tables. The tables to import are specified using
60
+ a comma delimited list, such as "people,titles". Use it to save
61
+ storage space.
62
+ --verbose Show database interaction
50
63
 
51
64
  Just run the program with no arguments, and you'll get a file named `imdb.db`
52
65
  in the current working directory.
53
66
 
54
67
  ### Hints
55
68
  * Make sure the disk the database is written to has sufficient space.
56
- About 5 GiB is needed.
69
+ About 19 GiB is needed as of early 2026. About 9.5 GB without indices.
70
+ (for even less storage requirement, see Disk space tips below).
57
71
  * Use a SSD to speed up the import.
58
72
  * To check the best case import performance, use an in-memory database:
59
73
  `--db :memory:`.
@@ -61,50 +75,53 @@ in the current working directory.
61
75
  ## Example
62
76
 
63
77
  $ imdb-sqlite
78
+
79
+ 2026-02-04 16:30:31,311 Populating database: imdb.db
80
+ 2026-02-04 16:30:31,319 Applying schema
81
+
82
+ 2026-02-04 16:30:31,323 Importing file: downloads\name.basics.tsv.gz
83
+ 2026-02-04 16:30:31,324 Reading number of rows ...
84
+ 2026-02-04 16:30:34,373 Inserting rows into table: people
85
+ 100%|██████████████████| 15063390/15063390 [01:05<00:00, 228659.33 rows/s]
64
86
 
65
- 2018-07-08 16:00:00,000 Populating database: imdb.db
66
- 2018-07-08 16:00:00,001 Applying schema
67
-
68
- 2018-07-08 16:00:00,005 Importing file: downloads\name.basics.tsv.gz
69
- 2018-07-08 16:00:00,005 Reading number of rows ...
70
- 2018-07-08 16:00:11,521 Inserting rows into table: people
71
- 100%|█████████████████████████| 8699964/8699964 [01:23<00:00, 104387.75 rows/s]
72
-
73
- 2018-07-08 16:01:34,868 Importing file: downloads\title.basics.tsv.gz
74
- 2018-07-08 16:01:34,868 Reading number of rows ...
75
- 2018-07-08 16:01:41,873 Inserting rows into table: titles
76
- 100%|██████████████████████████| 5110779/5110779 [00:58<00:00, 87686.98 rows/s]
77
-
78
- 2018-07-08 16:02:40,161 Importing file: downloads\title.akas.tsv.gz
79
- 2018-07-08 16:02:40,161 Reading number of rows ...
80
- 2018-07-08 16:02:44,743 Inserting rows into table: akas
81
- 100%|██████████████████████████| 3625334/3625334 [00:37<00:00, 97412.94 rows/s]
82
-
83
- 2018-07-08 16:03:21,964 Importing file: downloads\title.principals.tsv.gz
84
- 2018-07-08 16:03:21,964 Reading number of rows ...
85
- 2018-07-08 16:03:55,922 Inserting rows into table: crew
86
- 100%|███████████████████████| 28914893/28914893 [03:45<00:00, 128037.21 rows/s]
87
-
88
- 2018-07-08 16:07:41,757 Importing file: downloads\title.episode.tsv.gz
89
- 2018-07-08 16:07:41,757 Reading number of rows ...
90
- 2018-07-08 16:07:45,370 Inserting rows into table: episodes
91
- 100%|█████████████████████████| 3449903/3449903 [00:21<00:00, 158265.16 rows/s]
92
-
93
- 2018-07-08 16:08:07,172 Importing file: downloads\title.ratings.tsv.gz
94
- 2018-07-08 16:08:07,172 Reading number of rows ...
95
- 2018-07-08 16:08:08,029 Inserting rows into table: ratings
96
- 100%|███████████████████████████| 846901/846901 [00:05<00:00, 152421.27 rows/s]
97
-
98
- 2018-07-08 16:08:13,589 Creating table indices ...
99
- 2018-07-08 16:09:16,451 Import successful
87
+ 2026-02-04 16:31:40,262 Importing file: downloads\title.basics.tsv.gz
88
+ 2026-02-04 16:31:40,262 Reading number of rows ...
89
+ 2026-02-04 16:31:42,777 Inserting rows into table: titles
90
+ 100%|██████████████████| 12265715/12265715 [01:06<00:00, 185564.42 rows/s]
91
+
92
+ 2026-02-04 16:32:48,879 Importing file: downloads\title.akas.tsv.gz
93
+ 2026-02-04 16:32:48,880 Reading number of rows ...
94
+ 2026-02-04 16:32:54,646 Inserting rows into table: akas
95
+ 100%|██████████████████| 54957563/54957563 [04:06<00:00, 222556.12 rows/s]
96
+
97
+ 2026-02-04 16:37:01,586 Importing file: downloads\title.principals.tsv.gz
98
+ 2026-02-04 16:37:01,587 Reading number of rows ...
99
+ 2026-02-04 16:37:11,294 Inserting rows into table: crew
100
+ 100%|██████████████████| 97617046/97617046 [06:27<00:00, 251790.20 rows/s]
101
+
102
+ 2026-02-04 16:43:38,990 Importing file: downloads\title.episode.tsv.gz
103
+ 2026-02-04 16:43:38,990 Reading number of rows ...
104
+ 2026-02-04 16:43:39,635 Inserting rows into table: episodes
105
+ 100%|████████████████████| 9462887/9462887 [00:29<00:00, 315650.53 rows/s]
106
+
107
+ 2026-02-04 16:44:09,618 Importing file: downloads\title.ratings.tsv.gz
108
+ 2026-02-04 16:44:09,618 Reading number of rows ...
109
+ 2026-02-04 16:44:09,706 Inserting rows into table: ratings
110
+ 100%|████████████████████| 1631810/1631810 [00:05<00:00, 304073.42 rows/s]
111
+
112
+ 2026-02-04 16:44:15,077 Creating table indices ...
113
+ 100%|██████████████████████████████████| 12/12 [03:19<00:00, 16.64s/index]
114
+
115
+ 2026-02-04 16:47:34,781 Analyzing DB to generate statistic for query planner ...
116
+ 2026-02-04 16:48:01,367 Import successful
100
117
 
101
118
 
102
119
  ### Note
103
120
  The import may take a long time, since there are millions of records to
104
121
  process.
105
122
 
106
- The above example used python 3.6.4 on windows 7, with the working directory
107
- being on a SSD.
123
+ The above example used python 3.10.13 on windows 10, with the working directory
124
+ being on a fast Kingston NVME SSD.
108
125
 
109
126
  ## Data model
110
127
 
@@ -118,7 +135,8 @@ reference it is in order.
118
135
 
119
136
  A movie has a title, a TV show has one. An episode has one as well. Well two
120
137
  actually; the title of the show, and the title of the episode itself. That is
121
- why there are two links to the same `title_id` attribute in the `titles` table.
138
+ why there are two links to the same `title_id` attribute in the `titles` table,
139
+ from the `episodes` table.
122
140
 
123
141
  To make the relationships a bit clearer, following are a few query examples
124
142
 
@@ -188,7 +206,7 @@ massive query speedup.
188
206
  For example `sqlite3 imdb.db "CREATE INDEX myindex ON <table-name> (<slow-column>)"`
189
207
 
190
208
  ### Disk space tips
191
- The imported data as of 2023 produces a database file that is about 12 GiB.
209
+ The imported data as of 2026 produces a database file that is about 19 GiB.
192
210
  About half of that space is for indices used to speed up query lookups and
193
211
  joins. The default indices take up about as much as the data.
194
212
 
@@ -197,7 +215,7 @@ ETL-step, for refreshing the dataset every now and then, and then simply export
197
215
  the full tables (e.g. for data science using pandas/ML), a `--no-index` flag is
198
216
  available. When specifying this flag, no indices will be created, which not
199
217
  only saves about 50% disk space, but also speeds up the overall import process.
200
- When this flag is provided, the DB file will be just shy of 6 GiB as of date of
218
+ When this flag is provided, the DB file will be just 9.5 GiB as of date of
201
219
  writing.
202
220
 
203
221
  If you know precisely which indices you need, omitting the default indices may
@@ -205,6 +223,43 @@ also be a good idea, since you'd then not waste disk space on indices you don't
205
223
  need. Simply create the indices you _do_ need manually, as illustrated in the
206
224
  performance tip above.
207
225
 
226
+ As an indicator, following is the current space consumption spread across the tables.
227
+
228
+ Full import
229
+
230
+ * default (includes indices): 19 GB
231
+ * without indices: 9.5 GB
232
+
233
+ Sizes of the respective tables when doing selective import of only a single
234
+ table without indices.
235
+
236
+ ```
237
+ * crew: 46% (4.4 GB)
238
+ * akas: 28% (2.7 GB)
239
+ * titles: 14% (1.3 GB)
240
+ * people: 8% (0.8 GB)
241
+ * episodes: 3% (0.3 GB)
242
+ * ratings: 1% (0.1 GB)
243
+ ```
244
+
245
+ Percentages are the relative space consumption of the full index-free import
246
+ (~9.5 GB).
247
+
248
+ Fair to say, "who played what character", or "fetched a doughnut to what
249
+ VIP-of-wasting-space" accounts for about half the storage. If you can live
250
+ without those details then there's a massive storage saving to be made. Also, if
251
+ you don't need all the aliases for all the titles, like the Portuguese title of
252
+ some Bollywood flick, then the akas can also be skipped. Getting rid of those
253
+ two tables shaves off 3/4 of the required space. That's significant.
254
+
255
+ If you don't care about characters, and just want to query movies or shows, their
256
+ ratings and perhaps per-episode ratings as well, then 2 GiB of storage suffices
257
+ as you only need the titles, episodes and ratings tables. However, if you actually
258
+ want to query those tables as well, then you'd want to create indices, either
259
+ manually or use the default. This ups the space requirement about 50% (3GB).
260
+ I.e. just provide the command line argument `--only titles,ratings,episodes`.
261
+
262
+
208
263
  ## PyPI
209
264
  Current status of the project is:
210
265
  [![Build Status](https://github.com/jojje/imdb-sqlite/actions/workflows/python-publish.yml/badge.svg)](https://github.com/jojje/imdb-sqlite/actions/workflows/python-publish.yml)
@@ -0,0 +1,8 @@
1
+ imdb_sqlite/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
+ imdb_sqlite/__main__.py,sha256=w_xmBuRxj6-Cg31ITwnkW5QYWaco3E-3HYoiDZ_AZYk,13121
3
+ imdb_sqlite-1.2.0.dist-info/licenses/LICENSE,sha256=gXf5dRMhNSbfLPYYTY_5hsZ1r7UU1OaKQEAQUhuIBkM,18092
4
+ imdb_sqlite-1.2.0.dist-info/METADATA,sha256=J3N9Z7f7K_LYNcDLAPogrMOGiKkXO6Z222QWi15v1EM,12011
5
+ imdb_sqlite-1.2.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
6
+ imdb_sqlite-1.2.0.dist-info/entry_points.txt,sha256=0qU1qhre6z6V8Q_q0QxnKvmx2c8BVjhPGqIg9-Inpcc,58
7
+ imdb_sqlite-1.2.0.dist-info/top_level.txt,sha256=mPPacZxoJziQ2beMOavSJ9Yyg_dhGmOkNN5gP_boDSY,12
8
+ imdb_sqlite-1.2.0.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: bdist_wheel (0.40.0)
2
+ Generator: setuptools (80.10.2)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
@@ -1,8 +0,0 @@
1
- imdb_sqlite/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
- imdb_sqlite/__main__.py,sha256=U7XxeHdV1up2X2VOGKPYPFqAFL-majAglYVWyarsnas,12323
3
- imdb_sqlite-1.1.0.dist-info/LICENSE,sha256=gXf5dRMhNSbfLPYYTY_5hsZ1r7UU1OaKQEAQUhuIBkM,18092
4
- imdb_sqlite-1.1.0.dist-info/METADATA,sha256=R6P2TzoZLZSneH2Xe6faC6lLsrcdirmfagP1tyCoAwU,9927
5
- imdb_sqlite-1.1.0.dist-info/WHEEL,sha256=pkctZYzUS4AYVn6dJ-7367OJZivF2e8RA9b_ZBjif18,92
6
- imdb_sqlite-1.1.0.dist-info/entry_points.txt,sha256=0qU1qhre6z6V8Q_q0QxnKvmx2c8BVjhPGqIg9-Inpcc,58
7
- imdb_sqlite-1.1.0.dist-info/top_level.txt,sha256=mPPacZxoJziQ2beMOavSJ9Yyg_dhGmOkNN5gP_boDSY,12
8
- imdb_sqlite-1.1.0.dist-info/RECORD,,