nameday_vvc_pdf_extractor 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/lib/nameday_vvc_pdf_extractor.rb +87 -0
- metadata +58 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 9fe97f35bc0664cff59f3654a269aded15746d1ea09d54de83a2226cc32ef65e
|
4
|
+
data.tar.gz: c4bffc353a1ef7194f598bf5201c38f0e1363c2166c80cde6b7e9e6f14e3707f
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 953a5af7e247352891d97af1c72090bb40465590315dec2570e2a3b11abd4ee22315de820e729eef0551b5e3745e5d7dceaef8b0f3e784bb9b7cece0b87ed4e6
|
7
|
+
data.tar.gz: 18088a356cd9d77766012de04b536002503412f157fbdab41dbacf31a05f3f02f0ae0369b0e77e283d8ec9304096cbcc1aeabf5fcdadda1deee9b650555069e9
|
@@ -0,0 +1,87 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "pdf-reader"
|
4
|
+
require "date"
|
5
|
+
|
6
|
+
module Nameday
  # Extracts nameday data from the text layer of a PDF into a nested hash
  # of the form +{ month => { day => [names] | nil } }+.
  #
  # Usage: call {#read_pdf} with a file name, then {#extract}.
  class VvcPdfExtractor
    EMPTY_NAMEDAY_REGEXP = /\p{Pd}/ # Unicode category "Punctuation: Dash"
    # A nameday row starts with the day number followed by a dot, e.g. "12.".
    DAY_ROW_REGEXP = /\A\d+\./
    TEXT_ROW_DELIMITER = "\n"
    # Index in this list == month number; "-" is a placeholder for index 0.
    MONTH_NAMES = %w[
      -
      JANVĀRIS
      FEBRUĀRIS
      MARTS
      APRĪLIS
      MAIJS
      JŪNIJS
      JŪLIJS
      AUGUSTS
      SEPTEMBRIS
      OKTOBRIS
      NOVEMBRIS
      DECEMBRIS
    ].freeze

    # @return [Hash{Integer => Hash{Integer => Array<String>, nil}}]
    #   extracted data; empty until {#extract} has run
    attr_reader :output

    def initialize
      @output = {}
    end

    # Opens the PDF to extract from and discards any previously extracted
    # data, so that a following {#extract} reflects this file rather than
    # an earlier one.
    #
    # @param file_name [String] passed as-is to +PDF::Reader.new+
    # @return [PDF::Reader] the reader instance
    def read_pdf(file_name)
      # Build the reader first: if opening fails, the previous state is kept.
      reader = ::PDF::Reader.new(file_name)
      @output = {}
      @pdf_reader = reader
    end

    # Processes the opened PDF (once per opened file) and returns the result.
    #
    # @return [Hash] see {#output}
    # @raise [RuntimeError] if no PDF has been opened via {#read_pdf}
    def extract
      raise "PDF not opened!" unless @pdf_reader

      process_pdf
      output
    end

    private

    # Pre-populates @output with every calendar day mapped to nil.
    def prepare_output
      @output = {}

      # 2016 was leap year, so February 29 is included
      Date.new(2016).step(Date.new(2017)) do |date|
        @output[date.month] ||= {}
        @output[date.month][date.day] = nil
      end
    end

    # Walks all pages of the opened PDF; a no-op if data is already extracted.
    def process_pdf
      return unless @output.empty?

      prepare_output

      @current_month_index = nil
      @pdf_reader.pages.each { |page| process_pdf_page(page) }
    end

    # Scans one page's text row by row, tracking the current month heading
    # and handing day rows over to #process_nameday_value.
    def process_pdf_page(pdf_page)
      pdf_page.text.split(TEXT_ROW_DELIMITER).each do |raw_row|
        text_row = raw_row.strip
        next if text_row.empty?

        month_index = MONTH_NAMES.index(text_row)
        # Index 0 is the "-" placeholder, not a month: a bare "-" row must
        # not reset the month being processed.
        if month_index && month_index > 0
          @current_month_index = month_index
        elsif text_row.match?(DAY_ROW_REGEXP)
          process_nameday_value(text_row)
        end
      end
    end

    # Parses a row like "12. Name1, Name2" and appends the names to the
    # current month's day. Rows seen before any month heading, or whose day
    # number does not exist in the current month, are ignored.
    def process_nameday_value(text_row)
      day_token, *names = text_row.split(" ")
      day = day_token.to_i # Ignores ending dot

      month_days = @output[@current_month_index]
      return unless month_days && month_days.key?(day)

      names.each do |name|
        next if name.match?(EMPTY_NAMEDAY_REGEXP)

        cleaned_name = name.delete(",")
        next if cleaned_name.empty? # stray separator token

        (month_days[day] ||= []) << cleaned_name
      end
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,58 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: nameday_vvc_pdf_extractor
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Aleksandrs Ļedovskis
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2018-10-06 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: pdf-reader
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '2.1'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '2.1'
|
27
|
+
description:
|
28
|
+
email: aleksandrs@ledovskis.lv
|
29
|
+
executables: []
|
30
|
+
extensions: []
|
31
|
+
extra_rdoc_files: []
|
32
|
+
files:
|
33
|
+
- lib/nameday_vvc_pdf_extractor.rb
|
34
|
+
homepage: https://github.com/aleksandrs-ledovskis/nameday-vvc-pdf-extractor
|
35
|
+
licenses:
|
36
|
+
- BSD-3-Clause
|
37
|
+
metadata: {}
|
38
|
+
post_install_message:
|
39
|
+
rdoc_options: []
|
40
|
+
require_paths:
|
41
|
+
- lib
|
42
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
43
|
+
requirements:
|
44
|
+
- - "~>"
|
45
|
+
- !ruby/object:Gem::Version
|
46
|
+
version: '2.0'
|
47
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
48
|
+
requirements:
|
49
|
+
- - ">="
|
50
|
+
- !ruby/object:Gem::Version
|
51
|
+
version: '0'
|
52
|
+
requirements: []
|
53
|
+
rubyforge_project:
|
54
|
+
rubygems_version: 2.7.6
|
55
|
+
signing_key:
|
56
|
+
specification_version: 4
|
57
|
+
summary: Nameday data extraction from Valsts valodas centrs PDF
|
58
|
+
test_files: []
|