e621_export_downloader 0.0.13 → 0.0.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: a133742764b3b599b2cfd4c3494391538730be19daae10e9df43956aa1913709
4
- data.tar.gz: ec28fc8889a2ad4bef174f365985e89868068fe2f206b7325c65a3c881716942
3
+ metadata.gz: 89d525162aa1da43623f66a8bab4618ac17910c2c3e0db7a272e8149be91efb8
4
+ data.tar.gz: 6152f2a56d9ad995dd5868f2762d4a95c38e2b4d95a289b8806ef74870b50ea7
5
5
  SHA512:
6
- metadata.gz: 74262c9fc4b22df8847f75e80abb3ac9951fa7e265a306b168dcf087c64946849c2df5432d75e77a1ae8b0cc5d5869addb2348bd3b217abc7dcf02395e316521
7
- data.tar.gz: 61dfc91a23e7c34c69562534f9a42583f362558391264d12c4761b3f2368b9633f8d0f356939efcb78a3a27d71d5a2cefbf8047d536526e02dc8d41ec2b04853
6
+ metadata.gz: 494ccb09c9727311ec327cdc8926816bd0957f9d0a9455b6b5789364b30d4728d95763a99707b5861fb1d858092acaed1c220abd97dfb22f3f66e9de05fc6eaf
7
+ data.tar.gz: a0dbdb4a693365f88b5986ffcdef1a23190a3c6ce85e98297d344bf74b91bf4794700d5f0cdf4ab087cb695e0017f6464d21dfe5013419a29930d438e1786f43
@@ -1,8 +1,6 @@
1
1
  # frozen_string_literal: true
2
2
  # typed: true
3
3
 
4
- require("csv")
5
-
6
4
  module E621
7
5
  module CsvImportable
8
6
  extend(T::Sig)
@@ -17,22 +15,84 @@ module E621
17
15
  E621::RowCount.set(T.unsafe(self).table_name.split(".").last, count)
18
16
  end
19
17
 
20
- sig { params(csv_path: String).returns(Integer) }
21
- def import_from_csv(csv_path)
18
+ # Loads a CSV export into this model's table via PostgreSQL COPY.
19
+ #
20
+ # The file's raw bytes are streamed straight into COPY's native CSV parser
21
+ # (only the header line is read, for the column list) rather than parsing
22
+ # each row into a CSV::Row and re-serializing it in Ruby. That avoids a
23
+ # full single-threaded parse + re-encode of every row and preserves the
24
+ # exact bytes of fields containing embedded newlines/quotes. The row count
25
+ # is taken from COPY itself (PG::Result#cmd_tuples).
26
+ #
27
+ # truncate: empty the table before loading (full reload).
28
+ # recreate_indexes: drop every secondary (non-PK, non-constraint) index
29
+ # before the COPY and rebuild it afterward. Building the
30
+ # GIN/btree indexes once over the finished table is far
31
+ # cheaper than maintaining them row-by-row during COPY,
32
+ # and CREATE INDEX can use parallel workers.
33
+ #
34
+ # Physical-storage policy (UNLOGGED) and session tuning
35
+ # (maintenance_work_mem, max_parallel_maintenance_workers, ...) are the
36
+ # caller's responsibility — set them around this call.
37
+ sig do
38
+ params(
39
+ csv_path: String,
40
+ truncate: T::Boolean,
41
+ recreate_indexes: T::Boolean,
42
+ chunk_bytes: Integer,
43
+ ).returns(Integer)
44
+ end
45
+ def import_from_csv(csv_path, truncate: false, recreate_indexes: false, chunk_bytes: 1 << 20)
46
+ model = T.unsafe(self)
47
+ indexes = recreate_indexes ? secondary_index_definitions : {}
48
+
49
+ model.connection.execute("TRUNCATE #{model.quoted_table_name}") if truncate
50
+ indexes.each_key { |name| model.connection.execute("DROP INDEX IF EXISTS #{name}") }
51
+
52
+ count = copy_csv(csv_path, chunk_bytes: chunk_bytes)
53
+
54
+ indexes.each_value { |ddl| model.connection.execute(ddl) }
55
+ self.row_count = count
56
+ count
57
+ end
58
+
59
+ private
60
+
61
+ sig { params(csv_path: String, chunk_bytes: Integer).returns(Integer) }
62
+ def copy_csv(csv_path, chunk_bytes:)
22
63
  model = T.unsafe(self)
23
- csv_headers = File.open(csv_path, "rb", &:readline).chomp.split(",").map(&:strip)
24
- columns = csv_headers.map { |h| model.connection.quote_column_name(h) }.join(", ")
64
+ header = File.open(csv_path, "rb", &:readline)
65
+ columns = header.chomp.split(",").map { |h| model.connection.quote_column_name(h.strip) }.join(", ")
25
66
 
26
- count = 0
27
67
  raw = model.connection.raw_connection
28
- raw.copy_data("COPY #{model.quoted_table_name} (#{columns}) FROM STDIN WITH (FORMAT CSV)") do
29
- CSV.foreach(csv_path, headers: true) do |row|
30
- raw.put_copy_data(CSV.generate_line(T.cast(row, CSV::Row).fields))
31
- count += 1
68
+ result = raw.copy_data("COPY #{model.quoted_table_name} (#{columns}) FROM STDIN WITH (FORMAT CSV)") do
69
+ File.open(csv_path, "rb") do |io|
70
+ io.readline # skip header row
71
+ while (chunk = io.read(chunk_bytes))
72
+ raw.put_copy_data(chunk)
73
+ end
32
74
  end
33
75
  end
34
- self.row_count = count
35
- count
76
+ result.cmd_tuples
77
+ end
78
+
79
+ # Every non-primary-key, non-constraint-backed index on the table, paired
80
+ # with the DDL needed to recreate it.
81
+ sig { returns(T::Hash[String, String]) }
82
+ def secondary_index_definitions
83
+ model = T.unsafe(self)
84
+ conn = model.connection
85
+ sql = <<~SQL
86
+ SELECT i.indexrelid::regclass::text AS name,
87
+ pg_get_indexdef(i.indexrelid) AS ddl
88
+ FROM pg_index i
89
+ WHERE i.indrelid = #{conn.quote(model.quoted_table_name)}::regclass
90
+ AND NOT i.indisprimary
91
+ AND NOT EXISTS (
92
+ SELECT 1 FROM pg_constraint c WHERE c.conindid = i.indexrelid
93
+ )
94
+ SQL
95
+ conn.exec_query(sql).to_h { |r| [r["name"], r["ddl"]] }
36
96
  end
37
97
  end
38
98
  end
@@ -4,7 +4,7 @@
4
4
  # loaded by bundler
5
5
  module E621ExportDownloader
6
6
  module Constants
7
- VERSION = "0.0.13"
7
+ VERSION = "0.0.14"
8
8
  WEBSITE = "https://github.com/DonovanDMC/E621ExportDownloader.rb"
9
9
  end
10
10
  end
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: e621_export_downloader
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.13
4
+ version: 0.0.14
5
5
  platform: ruby
6
6
  authors:
7
7
  - Donovan_DMC
8
8
  bindir: exe
9
9
  cert_chain: []
10
- date: 2026-06-13 00:00:00.000000000 Z
10
+ date: 2026-06-14 00:00:00.000000000 Z
11
11
  dependencies:
12
12
  - !ruby/object:Gem::Dependency
13
13
  name: csv