nameday_vvc_pdf_extractor 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/nameday_vvc_pdf_extractor.rb +18 -8
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d4e37438d8abad8fdd52b4cfceacb64358dc2df629ba34f5c1c3b0ebb9673e5f
|
4
|
+
data.tar.gz: e1c2b0e924bf98d906bca47bca2babf889dfe2992778bf372ed3a1134f8e9d1b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: fcfc82617988a038a37e31495e33e74a54809dbeffe0283b326b2ac0c2feaea4348bc4899c72dd89a0d8561b4efd16980cfb9b2a97b6218157e5ad1dc9f86831
|
7
|
+
data.tar.gz: 1e5c09a644d3efca24b87b9f3672eb18c81ed97f8ad2d43b720b6a5025277c4f5bd539566682b1dca9469d2d8b1dc5936305d8576cbee72ac194d523015c54c2
|
data/lib/nameday_vvc_pdf_extractor.rb
CHANGED
@@ -4,6 +4,8 @@ require "pdf-reader"
|
|
4
4
|
require "date"
|
5
5
|
|
6
6
|
module Nameday
|
7
|
+
# Use this class to extract structured nameday information
|
8
|
+
# from pre-existing VVC PDF file
|
7
9
|
class VvcPdfExtractor
|
8
10
|
EMPTY_NAMEDAY_REGEXP = /\p{Pd}/ # Unicode category "Punctuation: Dash"
|
9
11
|
TEXT_ROW_DELIMITER = "\n"
|
@@ -26,6 +28,7 @@ module Nameday
|
|
26
28
|
attr_reader :output
|
27
29
|
|
28
30
|
def initialize
|
31
|
+
@pdf_reader = nil
|
29
32
|
@output = {}
|
30
33
|
end
|
31
34
|
|
@@ -35,6 +38,7 @@ module Nameday
|
|
35
38
|
|
36
39
|
def extract
|
37
40
|
raise("PDF not opened!") unless @pdf_reader
|
41
|
+
|
38
42
|
process_pdf
|
39
43
|
output
|
40
44
|
end
|
@@ -53,24 +57,29 @@ module Nameday
|
|
53
57
|
|
54
58
|
def process_pdf
|
55
59
|
return unless @output == {}
|
60
|
+
|
56
61
|
prepare_output
|
57
62
|
|
58
63
|
@current_month_index = nil
|
59
|
-
@pdf_reader.pages.each do |
|
60
|
-
process_pdf_page(
|
64
|
+
@pdf_reader.pages.each do |pdf_page|
|
65
|
+
process_pdf_page(pdf_page)
|
61
66
|
end
|
62
67
|
end
|
63
68
|
|
64
69
|
def process_pdf_page(pdf_page)
|
65
|
-
text_rows = pdf_page.text.split(TEXT_ROW_DELIMITER).map
|
70
|
+
text_rows = pdf_page.text.split(TEXT_ROW_DELIMITER).map(&:strip)
|
66
71
|
text_rows.each do |text_row|
|
67
72
|
next if text_row.empty?
|
68
73
|
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
+
process_text_row(text_row)
|
75
|
+
end
|
76
|
+
end
|
77
|
+
|
78
|
+
def process_text_row(text_row)
|
79
|
+
if (new_month_index = MONTH_NAMES.index(text_row))
|
80
|
+
@current_month_index = new_month_index
|
81
|
+
elsif text_row.match?(/^\d+\./)
|
82
|
+
process_nameday_value(text_row)
|
74
83
|
end
|
75
84
|
end
|
76
85
|
|
@@ -80,6 +89,7 @@ module Nameday
|
|
80
89
|
|
81
90
|
nameday_data[1].split(",").each do |name|
|
82
91
|
next if name.match?(EMPTY_NAMEDAY_REGEXP)
|
92
|
+
|
83
93
|
@output[@current_month_index][day] ||= []
|
84
94
|
@output[@current_month_index][day] << name.strip
|
85
95
|
end
|