diabible-parser 0.1.9 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/diabible/parser/document.rb +5 -28
- data/lib/diabible/parser/helpers.rb +2 -1
- data/lib/diabible/parser/helpers/{notes.rb → constants.rb} +3 -21
- data/lib/diabible/parser/helpers/metadata.rb +69 -0
- data/lib/diabible/parser/helpers/text.rb +29 -28
- data/lib/diabible/parser/version.rb +1 -1
- metadata +4 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f864fedb03063a06330cbd7e774288991bcc2aaecd718765ab0630bdf82f5635
|
4
|
+
data.tar.gz: e90ebc75f456a7253effff38b109bff9ea75f0b890c276cc333a2e0dd20b5b08
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 38a09e6764a4dd0f34fa41791e6d7e5835960177fe8fc0b0576c878a8e2ece210e783bdc08c1e4a83c0ea246cf185d875ed7f71071960363214114aa61593188
|
7
|
+
data.tar.gz: 5802a91df97212ab7489204ad3c7f1017279076dfb59b89f8b6985cd359b20bf144dc9c9d8094779ddfc1f25a92cd0975830535617695f6bcfcd284935445587
|
@@ -29,11 +29,11 @@ module Diabible
|
|
29
29
|
|
30
30
|
class Document
|
31
31
|
include Helpers::Info
|
32
|
-
include Helpers::Notes
|
32
|
+
include Helpers::Metadata
|
33
33
|
include Helpers::Text
|
34
34
|
|
35
35
|
# default constructor
|
36
|
-
def initialize(document, info: true, notes: true, text: true, limit: 0)
|
36
|
+
def initialize(document, info: true, text: true, limit: 0)
|
37
37
|
Zip::File.open(URI.open(document)) do |zip_file|
|
38
38
|
# open content xml
|
39
39
|
@document = Nokogiri::XML(zip_file.read("word/document.xml"))
|
@@ -46,7 +46,6 @@ module Diabible
|
|
46
46
|
@limit = limit
|
47
47
|
# parse
|
48
48
|
parse_info if info
|
49
|
-
parse_notes if notes
|
50
49
|
parse_text if text
|
51
50
|
end
|
52
51
|
|
@@ -71,19 +70,9 @@ module Diabible
|
|
71
70
|
|
72
71
|
private
|
73
72
|
|
74
|
-
def
|
75
|
-
@
|
76
|
-
@
|
77
|
-
end
|
78
|
-
|
79
|
-
def note_count
|
80
|
-
@note_count ||= 0
|
81
|
-
@note_count += 1
|
82
|
-
end
|
83
|
-
|
84
|
-
def text_count
|
85
|
-
@text_count ||= 0
|
86
|
-
@text_count += 1
|
73
|
+
def count
|
74
|
+
@count ||= 0
|
75
|
+
@count += 1
|
87
76
|
end
|
88
77
|
|
89
78
|
# get text content of all nodes matched by xpath
|
@@ -97,18 +86,6 @@ module Diabible
|
|
97
86
|
# return container
|
98
87
|
content
|
99
88
|
end
|
100
|
-
|
101
|
-
# add meta hash
|
102
|
-
def add_metadata(name, type, value, position = nil)
|
103
|
-
# firstly check if the last field is not a same type
|
104
|
-
if @metadata.last and @metadata.last[:type] != PARAGRAPH and @metadata.last[:type] == type
|
105
|
-
# merge fields together
|
106
|
-
@metadata.last[:value] += value
|
107
|
-
else
|
108
|
-
# normally add a new meta hash with uniqe namee
|
109
|
-
@metadata << { name: "#{name}_#{send("#{name}_count")}_#{type}", type: type, value: value, position: position }
|
110
|
-
end
|
111
|
-
end
|
112
89
|
end
|
113
90
|
end
|
114
91
|
end
|
@@ -19,8 +19,9 @@
|
|
19
19
|
# Authors: Michal Mocnak <michal@marigan.net>
|
20
20
|
#
|
21
21
|
|
22
|
+
require 'diabible/parser/helpers/constants'
|
22
23
|
require 'diabible/parser/helpers/info'
|
23
|
-
require 'diabible/parser/helpers/notes'
|
24
|
+
require 'diabible/parser/helpers/metadata'
|
24
25
|
require 'diabible/parser/helpers/text'
|
25
26
|
|
26
27
|
module Diabible
|
@@ -22,28 +22,10 @@
|
|
22
22
|
module Diabible
|
23
23
|
module Parser
|
24
24
|
module Helpers
|
25
|
-
module Notes
|
25
|
+
module Constants
|
26
26
|
|
27
|
-
|
28
|
-
|
29
|
-
# parse notes box
|
30
|
-
def parse_notes
|
31
|
-
# iterate through all paragraphs with style
|
32
|
-
@document.xpath("//body/p").each do |p|
|
33
|
-
# store pStyle value
|
34
|
-
pStyle = p.xpath(".//pStyle/@val")
|
35
|
-
# process only those with pstyle
|
36
|
-
unless pStyle.empty? or ['Hlavicka', 'Incipit', 'Podnadpis', 'Podnadpis1'].include?(pStyle.first.value)
|
37
|
-
# iterate over r blocks
|
38
|
-
p.xpath(".//r").each do |r|
|
39
|
-
name = pStyle.first.value
|
40
|
-
value = content(r, ".//t")
|
41
|
-
# add into container
|
42
|
-
add_metadata("note", name, value)
|
43
|
-
end
|
44
|
-
end
|
45
|
-
end
|
46
|
-
end
|
27
|
+
NONE = 'none'
|
28
|
+
PARAGRAPH = 'paragraph'
|
47
29
|
end
|
48
30
|
end
|
49
31
|
end
|
@@ -0,0 +1,69 @@
|
|
1
|
+
#
|
2
|
+
# Copyright (C) 2021 diabible.com
|
3
|
+
#
|
4
|
+
# This file is part of Diabible Parser.
|
5
|
+
#
|
6
|
+
# Diabible Parser is free software: you can redistribute it and/or modify
|
7
|
+
# it under the terms of the GNU General Public License as published by
|
8
|
+
# the Free Software Foundation, either version 3 of the License, or
|
9
|
+
# (at your option) any later version.
|
10
|
+
#
|
11
|
+
# Diabible Parser is distributed in the hope that it will be useful,
|
12
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
13
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
14
|
+
# GNU General Public License for more details.
|
15
|
+
#
|
16
|
+
# You should have received a copy of the GNU General Public License
|
17
|
+
# along with Diabible Parser. If not, see <http://www.gnu.org/licenses/>.
|
18
|
+
#
|
19
|
+
# Authors: Michal Mocnak <michal@marigan.net>
|
20
|
+
#
|
21
|
+
|
22
|
+
module Diabible
|
23
|
+
module Parser
|
24
|
+
module Helpers
|
25
|
+
module Metadata
|
26
|
+
|
27
|
+
# add meta hash
|
28
|
+
def add_metadata(group, type, value, position = nil)
|
29
|
+
# get index
|
30
|
+
index = count
|
31
|
+
# normally add a new meta hash with uniqe name
|
32
|
+
@metadata << { name: "#{index}_#{group}_#{type}", group: group, type: type, index: index, value: value, position: position }
|
33
|
+
end
|
34
|
+
|
35
|
+
# get metadata array per group or type
|
36
|
+
def get_metadata(group: nil, type: nil, filter: :exclusive, filters: [])
|
37
|
+
if group and type
|
38
|
+
return metadata_query(filter: filter, filters: filters) do |m|
|
39
|
+
m[:group] == group and m[:type] == type
|
40
|
+
end
|
41
|
+
elsif group
|
42
|
+
return metadata_query(filter: filter, filters: filters) do |m|
|
43
|
+
m[:group] == group
|
44
|
+
end
|
45
|
+
elsif type
|
46
|
+
return metadata_query(filter: filter, filters: filters) do |m|
|
47
|
+
m[:type] == type
|
48
|
+
end
|
49
|
+
end
|
50
|
+
# empty when nothing selected
|
51
|
+
return []
|
52
|
+
end
|
53
|
+
|
54
|
+
def metadata_query(filter: :exclusive, filters: [])
|
55
|
+
return @metadata.select do |m|
|
56
|
+
if filters.empty?
|
57
|
+
yield m
|
58
|
+
else
|
59
|
+
# get filters intersection
|
60
|
+
filtered = filters & [m[:type], m[:group]]
|
61
|
+
# resolution
|
62
|
+
(yield m) and (filter == :exclusive ? filtered.empty? : !filtered.empty?)
|
63
|
+
end
|
64
|
+
end
|
65
|
+
end
|
66
|
+
end
|
67
|
+
end
|
68
|
+
end
|
69
|
+
end
|
@@ -19,49 +19,50 @@
|
|
19
19
|
# Authors: Michal Mocnak <michal@marigan.net>
|
20
20
|
#
|
21
21
|
|
22
|
+
require 'diabible/parser/helpers/constants'
|
23
|
+
|
22
24
|
module Diabible
|
23
25
|
module Parser
|
24
26
|
module Helpers
|
25
27
|
module Text
|
28
|
+
include Diabible::Parser::Helpers::Constants
|
26
29
|
|
27
30
|
private
|
28
31
|
|
29
|
-
# paragraph type id
|
30
|
-
PARAGRAPH = 'paragraph'
|
31
|
-
|
32
32
|
# parse main text entries
|
33
33
|
def parse_text
|
34
34
|
# iterate through all paragraphs without style
|
35
35
|
@document.xpath("//body/p").each do |p|
|
36
36
|
# store pStyle value
|
37
37
|
pStyle = p.xpath(".//pStyle/@val")
|
38
|
-
#
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
# filter paragraphs without content
|
61
|
-
if paragraph_in != paragraph_out
|
62
|
-
add_metadata("text", PARAGRAPH, nil, { in: paragraph_in, out: paragraph_out })
|
38
|
+
# prepare group
|
39
|
+
group = pStyle.first.value unless pStyle.empty?
|
40
|
+
group ||= NONE
|
41
|
+
# position in for paragraph against plain text
|
42
|
+
paragraph_in = @text.length
|
43
|
+
# iterate over r blocks
|
44
|
+
p.xpath(".//r").each do |r|
|
45
|
+
# store rstyle value
|
46
|
+
rStyle = r.xpath(".//rStyle/@val")
|
47
|
+
# if empty then it's a plain text without style
|
48
|
+
if (rStyle.empty? or rStyle.first.value === 'Text') and pStyle.empty?
|
49
|
+
# update text object
|
50
|
+
@text += content(r, ".//t")
|
51
|
+
else
|
52
|
+
# otherwise create metadata field
|
53
|
+
# prepare type
|
54
|
+
type = rStyle.first.value unless rStyle.empty?
|
55
|
+
type ||= NONE
|
56
|
+
# prepare value
|
57
|
+
value = content(r, ".//t")
|
58
|
+
# add into container
|
59
|
+
add_metadata(group, type, value, { in: @text.length })
|
63
60
|
end
|
64
61
|
end
|
62
|
+
# position out for paragraph
|
63
|
+
paragraph_out = @text.length
|
64
|
+
# filter paragraphs without content
|
65
|
+
add_metadata(PARAGRAPH, group, NONE, { in: paragraph_in, out: paragraph_out })
|
65
66
|
# limit check
|
66
67
|
if @limit > 0
|
67
68
|
if @metadata.size >= @limit
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: diabible-parser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.9
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Michal Mocnak
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2021-03-
|
11
|
+
date: 2021-03-08 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -65,8 +65,9 @@ files:
|
|
65
65
|
- lib/diabible/parser.rb
|
66
66
|
- lib/diabible/parser/document.rb
|
67
67
|
- lib/diabible/parser/helpers.rb
|
68
|
+
- lib/diabible/parser/helpers/constants.rb
|
68
69
|
- lib/diabible/parser/helpers/info.rb
|
69
|
-
- lib/diabible/parser/helpers/notes.rb
|
70
|
+
- lib/diabible/parser/helpers/metadata.rb
|
70
71
|
- lib/diabible/parser/helpers/text.rb
|
71
72
|
- lib/diabible/parser/version.rb
|
72
73
|
- spec/spec_helper.rb
|