nameday_vvc_pdf_extractor 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 9fe97f35bc0664cff59f3654a269aded15746d1ea09d54de83a2226cc32ef65e
4
+ data.tar.gz: c4bffc353a1ef7194f598bf5201c38f0e1363c2166c80cde6b7e9e6f14e3707f
5
+ SHA512:
6
+ metadata.gz: 953a5af7e247352891d97af1c72090bb40465590315dec2570e2a3b11abd4ee22315de820e729eef0551b5e3745e5d7dceaef8b0f3e784bb9b7cece0b87ed4e6
7
+ data.tar.gz: 18088a356cd9d77766012de04b536002503412f157fbdab41dbacf31a05f3f02f0ae0369b0e77e283d8ec9304096cbcc1aeabf5fcdadda1deee9b650555069e9
@@ -0,0 +1,87 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "pdf-reader"
4
+ require "date"
5
+
6
module Nameday
  # Extracts nameday data from a "Valsts valodas centrs" PDF.
  #
  # The PDF text is scanned row by row. A row equal to one of MONTH_NAMES
  # switches the current month; a row starting with "<day>." contributes the
  # names listed after the day number to that day of the current month.
  #
  # @example
  #   extractor = Nameday::VvcPdfExtractor.new
  #   extractor.read_pdf("namedays.pdf")
  #   extractor.extract # => { 1 => { 1 => [...], 2 => [...] }, ..., 12 => { ... } }
  class VvcPdfExtractor
    # Any token containing a dash is treated as "no nameday" and skipped.
    EMPTY_NAMEDAY_REGEXP = /\p{Pd}/ # Unicode category "Punctuation: Dash"
    TEXT_ROW_DELIMITER = "\n"
    # Index 0 holds a placeholder so that the index of a month name equals
    # its calendar month number (1..12).
    MONTH_NAMES = %w[
      -
      JANVĀRIS
      FEBRUĀRIS
      MARTS
      APRĪLIS
      MAIJS
      JŪNIJS
      JŪLIJS
      AUGUSTS
      SEPTEMBRIS
      OKTOBRIS
      NOVEMBRIS
      DECEMBRIS
    ].freeze
    # A nameday row starts with the day number followed by a dot ("12. ...").
    NAMEDAY_ROW_REGEXP = /\A\d+\./

    # @return [Hash{Integer => Hash{Integer => Array<String>, nil}}]
    #   month number => day number => names (nil when no names were found);
    #   an empty Hash until {#extract} has run.
    attr_reader :output

    def initialize
      @output = {}
    end

    # Opens the PDF to extract from and discards results of any previous
    # extraction, so the next {#extract} call reflects this file.
    #
    # @param file_name [String] whatever PDF::Reader.new accepts as input
    # @return [PDF::Reader] the opened reader
    def read_pdf(file_name)
      # Open first: if this raises, the previous reader and output stay intact.
      reader = ::PDF::Reader.new(file_name)
      @output = {}
      @pdf_reader = reader
    end

    # Runs the extraction (once per opened PDF; later calls return the
    # already computed result).
    #
    # @return [Hash] see {#output}
    # @raise [RuntimeError] when {#read_pdf} has not been called yet
    def extract
      raise("PDF not opened!") unless @pdf_reader
      process_pdf
      output
    end

    private

    # Pre-fills @output with every month/day combination mapped to nil.
    def prepare_output
      @output = {}

      # 2016 was a leap year, so February 29 gets a slot as well.
      Date.new(2016, 1, 1).upto(Date.new(2016, 12, 31)) do |date|
        (@output[date.month] ||= {})[date.day] = nil
      end
    end

    # Walks all pages of the opened PDF unless results are already present.
    def process_pdf
      return unless @output.empty?

      prepare_output
      @current_month_index = nil
      @pdf_reader.pages.each { |page| process_pdf_page(page) }
    end

    # Dispatches each non-empty text row of a page: month headings update the
    # current month, "<day>." rows are parsed as nameday entries, everything
    # else is ignored.
    def process_pdf_page(pdf_page)
      pdf_page.text.split(TEXT_ROW_DELIMITER).each do |raw_row|
        text_row = raw_row.strip
        next if text_row.empty?

        month_index = MONTH_NAMES.index(text_row)
        # Index 0 is the "-" placeholder, not a month; a row consisting of a
        # lone hyphen must not switch the current month.
        if month_index&.positive?
          @current_month_index = month_index
        elsif text_row.match?(NAMEDAY_ROW_REGEXP)
          process_nameday_value(text_row)
        end
      end
    end

    # Parses a "<day>. Name, Name, ..." row and appends the names to the
    # current month's entry for that day.
    def process_nameday_value(text_row)
      day_token, *names = text_row.split(" ")
      day = day_token.to_i # Ignores ending dot

      # Skip rows seen before any month heading and day numbers that do not
      # exist in the current month instead of failing or adding bogus keys.
      month_days = @output[@current_month_index]
      return unless month_days&.key?(day)

      names.each do |name|
        next if name.match?(EMPTY_NAMEDAY_REGEXP)

        cleaned_name = name.delete(",")
        next if cleaned_name.empty? # token was only a separator

        (month_days[day] ||= []) << cleaned_name
      end
    end
  end
end
metadata ADDED
@@ -0,0 +1,58 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: nameday_vvc_pdf_extractor
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Aleksandrs Ļedovskis
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2018-10-06 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: pdf-reader
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '2.1'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '2.1'
27
+ description:
28
+ email: aleksandrs@ledovskis.lv
29
+ executables: []
30
+ extensions: []
31
+ extra_rdoc_files: []
32
+ files:
33
+ - lib/nameday_vvc_pdf_extractor.rb
34
+ homepage: https://github.com/aleksandrs-ledovskis/nameday-vvc-pdf-extractor
35
+ licenses:
36
+ - BSD-3-Clause
37
+ metadata: {}
38
+ post_install_message:
39
+ rdoc_options: []
40
+ require_paths:
41
+ - lib
42
+ required_ruby_version: !ruby/object:Gem::Requirement
43
+ requirements:
44
+ - - "~>"
45
+ - !ruby/object:Gem::Version
46
+ version: '2.0'
47
+ required_rubygems_version: !ruby/object:Gem::Requirement
48
+ requirements:
49
+ - - ">="
50
+ - !ruby/object:Gem::Version
51
+ version: '0'
52
+ requirements: []
53
+ rubyforge_project:
54
+ rubygems_version: 2.7.6
55
+ signing_key:
56
+ specification_version: 4
57
+ summary: Nameday data extraction from Valsts valodas centrs PDF
58
+ test_files: []