nameday_vvc_pdf_extractor 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 9fe97f35bc0664cff59f3654a269aded15746d1ea09d54de83a2226cc32ef65e
4
+ data.tar.gz: c4bffc353a1ef7194f598bf5201c38f0e1363c2166c80cde6b7e9e6f14e3707f
5
+ SHA512:
6
+ metadata.gz: 953a5af7e247352891d97af1c72090bb40465590315dec2570e2a3b11abd4ee22315de820e729eef0551b5e3745e5d7dceaef8b0f3e784bb9b7cece0b87ed4e6
7
+ data.tar.gz: 18088a356cd9d77766012de04b536002503412f157fbdab41dbacf31a05f3f02f0ae0369b0e77e283d8ec9304096cbcc1aeabf5fcdadda1deee9b650555069e9
@@ -0,0 +1,87 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "pdf-reader"
4
+ require "date"
5
+
6
module Nameday
  # Extracts nameday data from the text layer of a PDF document.
  #
  # The extractor walks every page row by row. A row that is exactly one of
  # MONTH_NAMES switches the current month; a row that starts with a day
  # number followed by a dot ("12. Name, Name") contributes names to that day
  # of the current month.
  #
  # The result is a Hash of the shape
  #   { month (1..12) => { day => Array<String> or nil } }
  # pre-populated with every day of a leap year, so days without names are
  # present with a +nil+ value.
  #
  # @example
  #   extractor = Nameday::VvcPdfExtractor.new
  #   extractor.read_pdf("namedays.pdf")
  #   extractor.extract[1][1] # => names found for January 1st (or nil)
  class VvcPdfExtractor
    # Tokens matching this are treated as "no nameday" markers and skipped.
    # NOTE(review): the pattern is unanchored, so any token that merely
    # contains a dash (e.g. a hyphenated name) is skipped too - confirm that
    # this is intended for the source PDF.
    EMPTY_NAMEDAY_REGEXP = /\p{Pd}/ # Unicode category "Punctuation: Dash"
    TEXT_ROW_DELIMITER = "\n"
    # Month headings as they are matched against PDF text rows. The leading
    # "-" is only a placeholder that makes the array index equal to the
    # calendar month number (JANVĀRIS => 1, ..., DECEMBRIS => 12).
    MONTH_NAMES = %w[
      -
      JANVĀRIS
      FEBRUĀRIS
      MARTS
      APRĪLIS
      MAIJS
      JŪNIJS
      JŪLIJS
      AUGUSTS
      SEPTEMBRIS
      OKTOBRIS
      NOVEMBRIS
      DECEMBRIS
    ].freeze
    # A nameday row starts with the day number followed by a dot.
    DAY_ROW_REGEXP = /\A\d+\./

    # @return [Hash{Integer => Hash{Integer => Array<String>, nil}}]
    #   extracted data; an empty Hash until {#extract} has run
    attr_reader :output

    def initialize
      @output = {}
    end

    # Opens a PDF for a subsequent {#extract} call.
    #
    # Any previously extracted output is discarded, so that {#extract}
    # processes the newly opened document instead of returning stale data.
    #
    # @param file_name [String, IO] whatever PDF::Reader.new accepts
    # @return [PDF::Reader] the opened reader
    def read_pdf(file_name)
      reader = ::PDF::Reader.new(file_name)
      # Reset only after the new document was opened successfully.
      @output = {}
      @pdf_reader = reader
    end

    # Processes the opened PDF (once) and returns the collected data.
    #
    # @return [Hash] see {#output}
    # @raise [RuntimeError] when no PDF was opened via {#read_pdf}
    def extract
      raise "PDF not opened!" unless @pdf_reader

      process_pdf
      output
    end

    private

    # Pre-fills @output with a nil entry for every day of every month.
    def prepare_output
      @output = {}

      # 2016 was leap year, so February 29th gets a slot as well
      Date.new(2016, 1, 1).step(Date.new(2016, 12, 31)) do |date|
        (@output[date.month] ||= {})[date.day] = nil
      end
    end

    # Runs the extraction over all pages unless output was already produced.
    def process_pdf
      return unless @output.empty?

      prepare_output

      # The current month deliberately survives page breaks.
      @current_month_index = nil
      @pdf_reader.pages.each { |page| process_pdf_page(page) }
    end

    # Dispatches every non-blank text row of a page to month/day handling.
    def process_pdf_page(pdf_page)
      pdf_page.text.split(TEXT_ROW_DELIMITER).each do |raw_row|
        text_row = raw_row.strip
        next if text_row.empty?

        month_index = MONTH_NAMES.index(text_row)
        if month_index
          # Index 0 is the "-" placeholder, not a real month heading.
          @current_month_index = month_index if month_index.positive?
        elsif text_row.match?(DAY_ROW_REGEXP)
          process_nameday_value(text_row)
        end
      end
    end

    # Adds the names of one "<day>. Name, Name" row to the current month.
    #
    # Rows seen before any month heading, and rows whose number is not a
    # valid day of the current month, are ignored.
    def process_nameday_value(text_row)
      day_token, *name_tokens = text_row.split(" ")
      day = day_token.to_i # Ignores ending dot

      month_days = @output[@current_month_index]
      return unless month_days && month_days.key?(day)

      name_tokens.each do |token|
        next if token.match?(EMPTY_NAMEDAY_REGEXP)

        name = token.delete(",")
        next if name.empty? # token consisted of separators only

        (month_days[day] ||= []) << name
      end
    end
  end
end
metadata ADDED
@@ -0,0 +1,58 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: nameday_vvc_pdf_extractor
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Aleksandrs Ļedovskis
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2018-10-06 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: pdf-reader
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '2.1'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '2.1'
27
+ description:
28
+ email: aleksandrs@ledovskis.lv
29
+ executables: []
30
+ extensions: []
31
+ extra_rdoc_files: []
32
+ files:
33
+ - lib/nameday_vvc_pdf_extractor.rb
34
+ homepage: https://github.com/aleksandrs-ledovskis/nameday-vvc-pdf-extractor
35
+ licenses:
36
+ - BSD-3-Clause
37
+ metadata: {}
38
+ post_install_message:
39
+ rdoc_options: []
40
+ require_paths:
41
+ - lib
42
+ required_ruby_version: !ruby/object:Gem::Requirement
43
+ requirements:
44
+ - - "~>"
45
+ - !ruby/object:Gem::Version
46
+ version: '2.0'
47
+ required_rubygems_version: !ruby/object:Gem::Requirement
48
+ requirements:
49
+ - - ">="
50
+ - !ruby/object:Gem::Version
51
+ version: '0'
52
+ requirements: []
53
+ rubyforge_project:
54
+ rubygems_version: 2.7.6
55
+ signing_key:
56
+ specification_version: 4
57
+ summary: Nameday data extraction from Valsts valodas centrs PDF
58
+ test_files: []