nameday_vvc_pdf_extractor 0.1.1 → 0.1.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/nameday_vvc_pdf_extractor.rb +18 -8
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d4e37438d8abad8fdd52b4cfceacb64358dc2df629ba34f5c1c3b0ebb9673e5f
|
4
|
+
data.tar.gz: e1c2b0e924bf98d906bca47bca2babf889dfe2992778bf372ed3a1134f8e9d1b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: fcfc82617988a038a37e31495e33e74a54809dbeffe0283b326b2ac0c2feaea4348bc4899c72dd89a0d8561b4efd16980cfb9b2a97b6218157e5ad1dc9f86831
|
7
|
+
data.tar.gz: 1e5c09a644d3efca24b87b9f3672eb18c81ed97f8ad2d43b720b6a5025277c4f5bd539566682b1dca9469d2d8b1dc5936305d8576cbee72ac194d523015c54c2
|
@@ -4,6 +4,8 @@ require "pdf-reader"
|
|
4
4
|
require "date"
|
5
5
|
|
6
6
|
module Nameday
|
7
|
+
# Use this class to extract structured nameday information
|
8
|
+
# from a pre-existing VVC PDF file
|
7
9
|
class VvcPdfExtractor
|
8
10
|
EMPTY_NAMEDAY_REGEXP = /\p{Pd}/ # Unicode category "Punctuation: Dash"
|
9
11
|
TEXT_ROW_DELIMITER = "\n"
|
@@ -26,6 +28,7 @@ module Nameday
|
|
26
28
|
attr_reader :output
|
27
29
|
|
28
30
|
def initialize
|
31
|
+
@pdf_reader = nil
|
29
32
|
@output = {}
|
30
33
|
end
|
31
34
|
|
@@ -35,6 +38,7 @@ module Nameday
|
|
35
38
|
|
36
39
|
def extract
|
37
40
|
raise("PDF not opened!") unless @pdf_reader
|
41
|
+
|
38
42
|
process_pdf
|
39
43
|
output
|
40
44
|
end
|
@@ -53,24 +57,29 @@ module Nameday
|
|
53
57
|
|
54
58
|
def process_pdf
|
55
59
|
return unless @output == {}
|
60
|
+
|
56
61
|
prepare_output
|
57
62
|
|
58
63
|
@current_month_index = nil
|
59
|
-
@pdf_reader.pages.each do |
|
60
|
-
process_pdf_page(
|
64
|
+
@pdf_reader.pages.each do |pdf_page|
|
65
|
+
process_pdf_page(pdf_page)
|
61
66
|
end
|
62
67
|
end
|
63
68
|
|
64
69
|
def process_pdf_page(pdf_page)
|
65
|
-
text_rows = pdf_page.text.split(TEXT_ROW_DELIMITER).map
|
70
|
+
text_rows = pdf_page.text.split(TEXT_ROW_DELIMITER).map(&:strip)
|
66
71
|
text_rows.each do |text_row|
|
67
72
|
next if text_row.empty?
|
68
73
|
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
+
process_text_row(text_row)
|
75
|
+
end
|
76
|
+
end
|
77
|
+
|
78
|
+
def process_text_row(text_row)
|
79
|
+
if (new_month_index = MONTH_NAMES.index(text_row))
|
80
|
+
@current_month_index = new_month_index
|
81
|
+
elsif text_row.match?(/^\d+\./)
|
82
|
+
process_nameday_value(text_row)
|
74
83
|
end
|
75
84
|
end
|
76
85
|
|
@@ -80,6 +89,7 @@ module Nameday
|
|
80
89
|
|
81
90
|
nameday_data[1].split(",").each do |name|
|
82
91
|
next if name.match?(EMPTY_NAMEDAY_REGEXP)
|
92
|
+
|
83
93
|
@output[@current_month_index][day] ||= []
|
84
94
|
@output[@current_month_index][day] << name.strip
|
85
95
|
end
|