datanorm 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. checksums.yaml +7 -0
  2. data/README.md +154 -0
  3. data/lib/datanorm/document.rb +55 -0
  4. data/lib/datanorm/documents/assemble.rb +43 -0
  5. data/lib/datanorm/documents/assembles/price.rb +85 -0
  6. data/lib/datanorm/documents/assembles/product.rb +176 -0
  7. data/lib/datanorm/documents/preprocess.rb +36 -0
  8. data/lib/datanorm/documents/preprocesses/cache.rb +76 -0
  9. data/lib/datanorm/documents/preprocesses/process.rb +80 -0
  10. data/lib/datanorm/file.rb +65 -0
  11. data/lib/datanorm/header.rb +46 -0
  12. data/lib/datanorm/headers/v4/date.rb +39 -0
  13. data/lib/datanorm/headers/v4/version.rb +36 -0
  14. data/lib/datanorm/headers/v5/date.rb +25 -0
  15. data/lib/datanorm/headers/v5/version.rb +36 -0
  16. data/lib/datanorm/helpers/filename.rb +20 -0
  17. data/lib/datanorm/helpers/utf8.rb +20 -0
  18. data/lib/datanorm/lines/base.rb +67 -0
  19. data/lib/datanorm/lines/parse.rb +33 -0
  20. data/lib/datanorm/lines/v4/dimension.rb +44 -0
  21. data/lib/datanorm/lines/v4/extra.rb +55 -0
  22. data/lib/datanorm/lines/v4/parse.rb +42 -0
  23. data/lib/datanorm/lines/v4/price.rb +120 -0
  24. data/lib/datanorm/lines/v4/priceset.rb +42 -0
  25. data/lib/datanorm/lines/v4/product.rb +90 -0
  26. data/lib/datanorm/lines/v4/text.rb +31 -0
  27. data/lib/datanorm/lines/v5/dimension.rb +22 -0
  28. data/lib/datanorm/lines/v5/parse.rb +29 -0
  29. data/lib/datanorm/lines/v5/price.rb +27 -0
  30. data/lib/datanorm/lines/v5/product.rb +42 -0
  31. data/lib/datanorm/lines/v5/text.rb +30 -0
  32. data/lib/datanorm/logger.rb +15 -0
  33. data/lib/datanorm/logging.rb +27 -0
  34. data/lib/datanorm/progress.rb +26 -0
  35. data/lib/datanorm/version.rb +5 -0
  36. data/lib/datanorm.rb +49 -0
  37. metadata +158 -0
@@ -0,0 +1,80 @@
1
+ # frozen_string_literal: true
2
+
3
module Datanorm
  module Documents
    module Preprocesses
      # Persists a single record of a Datanorm file below the working directory,
      # organized so that it can be picked up again in a later step.
      class Process
        include Calls
        include ::Datanorm::Logging

        option :workdir
        option :record

        # Dispatches on the kind of the record.
        #
        # A common layout is [A], [B], [D] per product throughout the whole file;
        # in that case all three share the same ID.
        #
        # Another layout puts every [T] at the beginning and [A] etc. at the end of the file;
        # in that case the [T] records have their own IDs which are referenced by [A] later on.
        #
        # Records of any other kind are ignored (returns nil).
        def call
          return cache_longtext if record.kind_dimension? || record.kind_text?
          return cache_json if record.kind_extra?
          return cache_priceset if record.kind_priceset?

          cache_product if record.kind_product?
        end

        private

        # Text-like records carry their own unique IDs that Products can reference later.
        # They are cached together with the line number they belong at.
        def cache_longtext
          log { "Pre-Processing #{record.id}" }
          cache_record(target_line_number: record.line_number, content: record.content)
        end

        # Extra records are cached as their JSON representation.
        def cache_json
          cache_record(content: record.to_json)
        end

        # Hands the record over to the Cache, namespaced by record kind and keyed by ID.
        def cache_record(**extras)
          ::Datanorm::Documents::Preprocesses::Cache.call(
            workdir:,
            namespace: record.record_kind,
            id: record.id,
            **extras
          )
        end

        # One Product has many prices.
        # Each product gets its own file below `P/`, holding one JSON-encoded price per line.
        def cache_priceset
          prices_dir = workdir.join('P')
          FileUtils.mkdir_p(prices_dir)

          record.prices.each do |price|
            filename = ::Datanorm::Helpers::Filename.call(price.id)
            prices_dir.join(filename).open('a') { |io| io.write("#{price.to_json}\n") }
          end
        end

        # When preprocessing is done, we'll need to loop through each product once.
        # So every product is appended to one single file that can be walked through later.
        def cache_product
          workdir.join('A.txt').open('a') { |io| io.write("#{record.to_json}\n") }
        end
      end
    end
  end
end
@@ -0,0 +1,65 @@
1
+ # frozen_string_literal: true
2
+
3
module Datanorm
  # Parses a datanorm file line by line and wraps the lines in Ruby objects.
  #
  # The file is read as CP850 with semicolon-separated columns.
  class File
    include Datanorm::Logging
    include Enumerable

    # @param path [String, Pathname] location of the Datanorm file on disk
    def initialize(path:)
      log { "Loading file `#{path}`" }
      @path = path
    end

    # Parses the first line of the file into a Header. The result is memoized.
    #
    # @return [Datanorm::Header]
    def header
      return @header if defined?(@header)

      ::File.open(path, "r:#{Encoding::CP850}") do |file|
        first_line = file.gets
        log { 'Parsing header line...' }

        @header = ::Datanorm::Header.new(line: first_line)
      end
    end

    # Convenience shortcut.
    def version
      header.version
    end

    # Yields one parsed line object per row, skipping the header row.
    #
    # Without a block an Enumerator is returned, as is customary for `each`
    # in classes that include Enumerable.
    def each
      return enum_for(:each) unless block_given?

      line_number = 0

      ::CSV.foreach(path, **options) do |columns|
        line_number += 1
        next if line_number == 1 # Skip header, it's parsed separately

        yield ::Datanorm::Lines::Parse.call(version:, columns:, source_line_number: line_number)
      end
    end

    # We want this, so that we can indicate how much progress has been done.
    #
    # The total is only memoized once the scan finished, so that a scan that raised
    # (e.g. because the file is missing) does not leave a partial count behind.
    #
    # @return [Integer] number of lines in the file, including the header line
    def lines_count
      return @lines_count if defined?(@lines_count)

      log { 'Scanning number of total lines... (this takes about 2 seconds per GB)' }
      total = 0
      # `foreach` doesn't load the entire file into memory.
      ::File.foreach(path, encoding: Encoding::CP850) { total += 1 }
      log { "Scan complete, counted #{total} lines." }
      @lines_count = total
    end

    private

    attr_reader :path

    # Options handed to CSV when iterating over the rows.
    def options
      {
        encoding: Encoding::CP850,
        col_sep: ';',
        liberal_parsing: true
      }
    end
  end
end
@@ -0,0 +1,46 @@
1
+ # frozen_string_literal: true
2
+
3
module Datanorm
  # Wraps the first line of a DATANORM file and exposes the version and date found in it.
  class Header
    # @param line [String, nil] the raw first line; nil is treated as an empty line
    def initialize(line:)
      @line = line.to_s
    end

    def to_s
      number = version.number
      "HEADER <V#{number}> date <#{date}>"
    end

    # Detected version, memoized. Never nil: unknown formats yield a version numbered -1.
    def version
      @version ||= parse_version
    end

    # Date of the file, memoized. May be nil, which is why `||=` is not used here.
    def date
      @date = parse_date unless defined?(@date)
      @date
    end

    private

    attr_reader :line

    # The two detectors are mutually exclusive in detecting their own version.
    # There is probably no canonical, exact way to detect the version anyway.
    def parse_version
      detected = ::Datanorm::Headers::V5::Version.call(line:)
      detected ||= ::Datanorm::Headers::V4::Version.call(line:)
      detected || ::Datanorm::Version.new(number: -1, four?: false, five?: false) # Unknown version
    end

    # Picks the date parser that belongs to the detected version; nil for unknown versions.
    def parse_date
      return ::Datanorm::Headers::V5::Date.call(line:) if version.five?

      ::Datanorm::Headers::V4::Date.call(line:) if version.four?
    end
  end
end
@@ -0,0 +1,39 @@
1
+ # frozen_string_literal: true
2
+
3
module Datanorm
  module Headers
    module V4
      # Parses a Date from the raw first line of a DATANORM version 4 file.
      #
      # The date is read from characters 3 to 8 of the line, formatted as DDMMYY.
      class Date
        include Calls

        option :line

        # @return [::Date, nil] the parsed date, or nil when the line is too short,
        #   the date field is not made of six digits, or the digits are not a calendar date.
        def call
          return unless ddmmyy.match?(/\A\d{6}\z/)

          # Two-digit years below 50 are read as 20xx, everything else as 19xx.
          year = (yy < 50 ? 2000 : 1900) + yy
          # Six digits are not necessarily a real date (e.g. "000000"); don't let Date.new raise.
          return unless ::Date.valid_date?(year, mm, dd)

          ::Date.new(year, mm, dd)
        end

        private

        def dd
          ddmmyy[0..1].to_i
        end

        def mm
          ddmmyy[2..3].to_i
        end

        def yy
          ddmmyy[4..5].to_i
        end

        # Slicing beyond the end of a short line yields nil, so normalize to a String.
        def ddmmyy
          line.to_s[2..7].to_s
        end
      end
    end
  end
end
@@ -0,0 +1,36 @@
1
+ # frozen_string_literal: true
2
+
3
module Datanorm
  module Headers
    module V4
      # Detects whether the raw first line of a DATANORM file belongs to a version 4 file.
      class Version
        include Calls

        option :line

        # @return [Datanorm::Version, nil] a V4 version object, or nil if the line
        #   looks like V5 or does not carry the V4 marker.
        def call
          return if likely_v5? || !marked_as_v4?

          ::Datanorm::Version.new(number: 4, four?: true, five?: false)
        end

        private

        # The second character is a free-use byte.
        # V4 can have anything here.
        # V5 is in 99.999% of cases a semicolon.
        def likely_v5?
          line[1] == ';'
        end

        # V3 is universally not supported.
        # V4 has '04' at this position.
        # V5 doesn't use this and could have anything there.
        def marked_as_v4?
          line[123..124] == '04'
        end
      end
    end
  end
end
@@ -0,0 +1,25 @@
1
+ # frozen_string_literal: true
2
+
3
module Datanorm
  module Headers
    module V5
      # Parses a Date from the raw first line of a DATANORM version 5 file.
      #
      # The date is expected in the fourth semicolon-separated column.
      class Date
        include Calls

        option :line

        # @return [::Date, nil] the parsed date, or nil when the date column is missing or blank.
        def call
          # Date.parse raises for nil as well as for an empty String,
          # so bail out early whenever there is nothing to parse.
          raw_date = columns[3].to_s.strip
          return if raw_date.empty?

          ::Date.parse(raw_date)
        end

        def columns
          line.split(';')
        end
      end
    end
  end
end
@@ -0,0 +1,36 @@
1
+ # frozen_string_literal: true
2
+
3
module Datanorm
  module Headers
    module V5
      # Detects whether the raw first line of a DATANORM file belongs to a version 5 file.
      class Version
        include Calls

        option :line

        # @return [Datanorm::Version, nil] a V5 version object, or nil if the line
        #   does not look like a V5 header.
        def call
          fields = columns

          # More than three columns is an arbitrary threshold: are there enough semicolons
          # to believe that this could be a v5 file?
          return if fields.size <= 3

          # V4 can have anything in the second column.
          # V5 definitely has "050" there.
          # That most likely stands for 0.5.0 and no newer version was ever released.
          return if fields[1] != '050'

          ::Datanorm::Version.new(number: 5, four?: false, five?: true)
        end

        private

        def columns
          line.split(';')
        end
      end
    end
  end
end
@@ -0,0 +1,20 @@
1
+ # frozen_string_literal: true
2
+
3
module Datanorm
  module Helpers
    # Turns a String (e.g. a product number) into something that is safe to use as a filename.
    class Filename
      include Calls

      param :input

      # @return [String] the URL-safe Base64 form of the UTF-8 input, suffixed with `.txt`
      # @raise [RuntimeError] if the input is nil
      def call
        raise "Should not write to file called `#{input.inspect}`" if input.nil?

        encoded_name = Base64.urlsafe_encode64(utf8_input)
        "#{encoded_name}.txt"
      end

      private

      # Product numbers may contain special characters, so normalize to UTF-8 first.
      # NOTE(review): the Utf8 helper returns nil for blank input, which becomes an empty
      # String here and therefore the filename `.txt` - confirm that this is intended.
      def utf8_input
        ::Datanorm::Helpers::Utf8.call(input).to_s
      end
    end
  end
end
@@ -0,0 +1,20 @@
1
+ # frozen_string_literal: true
2
+
3
module Datanorm
  module Helpers
    # Converts a value to a UTF-8 encoded String.
    class Utf8
      include Calls

      param :input

      # Datanorm files sometimes use a single space to indicate "no value",
      # so that is filtered out just like nil and empty input.
      #
      # @return [String, nil] the UTF-8 encoded text, or nil if there is no value
      def call
        return if input.nil?
        return if input == ' '

        text = input.to_s
        return if text.empty?

        text.encode('UTF-8')
      end
    end
  end
end
@@ -0,0 +1,67 @@
1
+ # frozen_string_literal: true
2
+
3
module Datanorm
  module Lines
    # Object that represents one line of a Datanorm file.
    class Base
      # Array that holds all attributes of one line.
      # In the Datanorm file they are typically separated by semicolons.
      # Header rows may lack semicolons (in V4), in that case, this Array has only one long String.
      attr_reader :columns

      # Where in the originating Datanorm file this line is located.
      attr_reader :source_line_number

      # This class is subclassed by one type per row.
      # Add convenient predicate methods to query the kind of record class.
      # E.g. `Datanorm::Lines::V4::Extra` has `kind_extra?` to be true.
      #
      # The predicate is defined on the class that is being inherited from, and compares
      # only the last segment of the class names (case-insensitively).
      # Anonymous subclasses have no name and therefore get no predicate.
      def self.inherited(subclass)
        kind = subclass.name.to_s.split('::').last.to_s.downcase

        unless kind.empty?
          kind_method = "kind_#{kind}?"

          # Avoid warnings during tests. Only look at methods defined right here (`false`):
          # `remove_method` raises for methods that are merely inherited from an ancestor.
          remove_method(kind_method) if method_defined?(kind_method, false)
          define_method(kind_method) do
            self.class.name.to_s.split('::').last.to_s.downcase == kind
          end
        end

        super
      end

      # @param columns [Array<String>] the attributes of the line
      # @param source_line_number [Integer] position of the line in the originating file
      def initialize(columns:, source_line_number:)
        @columns = columns
        @source_line_number = source_line_number
      end

      # Every row has a unique identifier. Most often a product number.
      # Text records commonly have their own IDs, which are not equal to the product number.
      # Multiple lines can have the same ID (e.g. one for price and several for description).
      #
      # @raise [RuntimeError] unless implemented by the subclass
      def id
        raise "Implement ##{__method__} in #{self.class}"
      end

      # @raise [RuntimeError] unless implemented by the subclass
      def as_json
        raise "Implement ##{__method__} in #{self.class}"
      end

      # The first column of every line represents the record type.
      # E.g. "T", "A". Also known as "Satzartenkennzeichen".
      def record_kind
        columns[0]
      end

      # Overridden in subclasses.
      def to_s
        to_json
      end

      # Convenience Shortcut to convert attributes from CP850 to UTF-8.
      def encode(...)
        ::Datanorm::Helpers::Utf8.call(...)
      end

      # Serializes whatever the subclass returns from `as_json`.
      def to_json(...)
        as_json.to_json(...)
      end
    end
  end
end
@@ -0,0 +1,33 @@
1
+ # frozen_string_literal: true
2
+
3
module Datanorm
  module Lines
    # Converts one line of a DATANORM file into a Ruby Object,
    # delegating to the parser of the given file version.
    #
    # V: Vorlaufsatz (identifies file metadata, often the first line).
    # K: Kopfsatz (header with catalog or transaction details).
    # A: Artikelsatz (product/article data).
    # B: Zusatzsatz (additional product data, e.g., EAN, packaging).
    # C: Leistungssatz/Konditionensatz (product installation time and public tender descriptions).
    # D: Langtextsatz (long text descriptions).
    # P: Preissatz (price data, often multiple products per line in V5).
    # T: Textbausteinsatz (text modules for descriptions).
    # S: Sonderbedingungssatz (special conditions, less common).
    #
    class Parse
      include Calls

      option :version
      option :columns
      option :source_line_number

      # @return the parsed line object, or nil if the version is neither four nor five
      def call
        version_parser&.call(columns:, source_line_number:)
      end

      private

      # The version-specific parser class, if there is one for this version.
      def version_parser
        return ::Datanorm::Lines::V4::Parse if version.four?

        ::Datanorm::Lines::V5::Parse if version.five?
      end
    end
  end
end
@@ -0,0 +1,44 @@
1
+ # frozen_string_literal: true
2
+
3
module Datanorm
  module Lines
    module V4
      # Immediate product description texts. Should take precedence over Text records.
      #
      # Layout of the "D" line (dimension text record):
      #  1 : Record type indicator[1] : letter D for dimension text
      #  2 : Processing indicator[1]  : N = new, L = deletion, A = change
      #  3 : Product number[15]       : alphanumeric characters
      #  4 : Line number[2/0]         : numeric
      #  5 : Sub-indicator[1]         : alphanumeric, F = free text, T = insert text blocks,
      #                                 E = insert text blocks and values
      #  6 : Free[8]                  : alphanumeric characters
      #  7 : Line text[40]            : alphanumeric characters
      #  8 : Line number[2/0]         : numeric
      #  9 : Sub-indicator[1]         : alphanumeric, F = free text, T = insert text blocks,
      #                                 E = insert text blocks and values
      # 10 : Free[8]                  : alphanumeric characters
      # 11 : Line text[40]            : alphanumeric characters
      class Dimension < ::Datanorm::Lines::Base
        # Single-line summary; line breaks in the content are shown as "⏎".
        def to_s
          padded_number = line_number.to_s.rjust(3)
          single_line = content.gsub("\n", '⏎')

          "<Dimension [#{id}] #{padded_number} #{single_line}>"
        end

        def id
          encode(columns[2])
        end

        def line_number
          columns[3].to_i
        end

        # Both line texts of this record, separated by a line break.
        def content
          [columns[6], columns[10]].map { |text| encode(text) }.join("\n")
        end

        # Orders records by their line number.
        def <=>(other)
          line_number <=> other.line_number
        end
      end
    end
  end
end
@@ -0,0 +1,55 @@
1
+ # frozen_string_literal: true
2
+
3
module Datanorm
  module Lines
    module V4
      # Additional product data, the "B" line (main record 2):
      #  1 : Record type indicator[1]          : letter B for main record 2
      #  2 : Processing indicator[1]           : N = new, L = deletion, A = change
      #  3 : Product number[15]                : alphanumeric characters
      #  4 : Matchcode[15]                     : alphanumeric characters
      #  5 : Alternative product number[15]    : alphanumeric characters
      #  6 : Catalog page[8]                   : alphanumeric characters
      #  7 : Area for copper surcharge
      #  7a : Area for copper surcharge
      #  7b : Area for copper surcharge
      #  7c : Area for copper surcharge
      #  8 : EAN number[13]                    : alphanumeric characters
      #  9 : Link number[12]                   : alphanumeric characters, for linking images
      # 10 : Product group[10]                 : alphanumeric characters, see also the .WRG file
      # 11 : Cost type[2/0]                    : numeric
      # 12 : Packaging quantity[5/0]           : numeric
      # 13 : Reference number creator code[4]  : alphanumeric characters
      # 14 : Reference number[17]              : alphanumeric characters
      class Extra < ::Datanorm::Lines::Base
        # Short summary. The matchcode part is left out when there is no matchcode.
        def to_s
          # `matchcode` is nil (not an empty String) when the column is blank,
          # so test for presence instead of calling `empty?` on it.
          tag = matchcode
          "EXTRA [#{id}] #{"{#{tag}}" if tag} EAN: #{ean}"
        end

        def id
          encode columns[2]
        end

        # This is like a tag. E.g. a product category.
        #
        # @return [String, nil] the stripped matchcode, or nil if the column is blank
        def matchcode
          encode columns[3].to_s.strip
        end

        def alternative_id
          encode columns[4]
        end

        def ean
          encode columns[9]
        end

        def category_id
          encode columns[11]
        end

        def as_json
          { alternative_id:, matchcode:, ean:, category_id: }
        end
      end
    end
  end
end
@@ -0,0 +1,42 @@
1
+ # frozen_string_literal: true
2
+
3
module Datanorm
  module Lines
    module V4
      # Converts a single DATANORM v4 line into a Ruby Object.
      #
      class Parse
        include Calls

        # Record types and the processing indicators they may carry:
        #
        # Preamble record "V": no indicators
        # Customer control record "K": no indicators
        # Product group record "S": no indicators
        # Discount record "R": no indicators
        # Main record 1 "A": N = new; L = deletion; A = change; X = product number change
        # Main record 2 "B": N = new; A = change
        # Dimension record "D": N = new; A = change; L = deletion
        # Long text record "T": N = new; A = change; L = deletion
        # Insertion record "E": N = new; A = change; L = deletion
        # Tiered price surcharge/discount record "Z": N = new; A = change; L = deletion
        # Service record "C": N = new; A = change; L = deletion
        # Product set record "J": N = new; A = change; L = deletion
        # Price change record "P": A = change; P = price change
        #
        # Only the record types listed here have a dedicated class.
        CLASSES = {
          'A' => Datanorm::Lines::V4::Product,
          'B' => Datanorm::Lines::V4::Extra,
          'D' => Datanorm::Lines::V4::Dimension,
          'T' => Datanorm::Lines::V4::Text,
          'P' => Datanorm::Lines::V4::Priceset
        }.freeze

        option :columns
        option :source_line_number

        # @return [Datanorm::Lines::Base] an instance of the class registered for the record
        #   type, or a plain Base instance for unknown record types and empty rows.
        def call
          klass = CLASSES.fetch(record_kind, Datanorm::Lines::Base)
          klass.new(columns:, source_line_number:)
        end

        private

        # First character of the first column.
        # Blank lines arrive as rows without any columns, so tolerate a missing first column.
        def record_kind
          columns.first.to_s[0]
        end
      end
    end
  end
end