diabible-parser 0.1.9 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 285570dffa4f4b3492b5f7be27719b19fe8196d50cbb45a193691d4337dda79d
4
- data.tar.gz: 9298c266d811c7703b82cd027a22797846999494760efdc562b3b8f0130415c3
3
+ metadata.gz: f864fedb03063a06330cbd7e774288991bcc2aaecd718765ab0630bdf82f5635
4
+ data.tar.gz: e90ebc75f456a7253effff38b109bff9ea75f0b890c276cc333a2e0dd20b5b08
5
5
  SHA512:
6
- metadata.gz: 3c7c5d92029e832189797bb5776669b05bc294cee0c558a8809c8bdbe1401eb8bafbabdfc927e6e721728c2d27f5e3356979c34a33adf36f18d3de4a988f369a
7
- data.tar.gz: ca8d01a2932ca13ffcc3a1f9f6ff6e51b8008abf520c5a8d21dc670ae4632798ee20b595bd72aae77391bce0ab3e98fef173cebf7ed8f03a72d5923e5b0bc92b
6
+ metadata.gz: 38a09e6764a4dd0f34fa41791e6d7e5835960177fe8fc0b0576c878a8e2ece210e783bdc08c1e4a83c0ea246cf185d875ed7f71071960363214114aa61593188
7
+ data.tar.gz: 5802a91df97212ab7489204ad3c7f1017279076dfb59b89f8b6985cd359b20bf144dc9c9d8094779ddfc1f25a92cd0975830535617695f6bcfcd284935445587
@@ -29,11 +29,11 @@ module Diabible
29
29
 
30
30
  class Document
31
31
  include Helpers::Info
32
- include Helpers::Notes
32
+ include Helpers::Metadata
33
33
  include Helpers::Text
34
34
 
35
35
  # default constructor
36
- def initialize(document, info: true, notes: true, text: true, limit: 0)
36
+ def initialize(document, info: true, text: true, limit: 0)
37
37
  Zip::File.open(URI.open(document)) do |zip_file|
38
38
  # open content xml
39
39
  @document = Nokogiri::XML(zip_file.read("word/document.xml"))
@@ -46,7 +46,6 @@ module Diabible
46
46
  @limit = limit
47
47
  # parse
48
48
  parse_info if info
49
- parse_notes if notes
50
49
  parse_text if text
51
50
  end
52
51
 
@@ -71,19 +70,9 @@ module Diabible
71
70
 
72
71
  private
73
72
 
74
- def info_count
75
- @info_count ||= 0
76
- @info_count += 1
77
- end
78
-
79
- def note_count
80
- @note_count ||= 0
81
- @note_count += 1
82
- end
83
-
84
- def text_count
85
- @text_count ||= 0
86
- @text_count += 1
73
+ def count
74
+ @count ||= 0
75
+ @count += 1
87
76
  end
88
77
 
89
78
  # get text content of all nodes matched by xpath
@@ -97,18 +86,6 @@ module Diabible
97
86
  # return container
98
87
  content
99
88
  end
100
-
101
- # add meta hash
102
- def add_metadata(name, type, value, position = nil)
103
- # firstly check if the last field is not a same type
104
- if @metadata.last and @metadata.last[:type] != PARAGRAPH and @metadata.last[:type] == type
105
- # merge fields together
106
- @metadata.last[:value] += value
107
- else
108
- # normally add a new meta hash with unique name
109
- @metadata << { name: "#{name}_#{send("#{name}_count")}_#{type}", type: type, value: value, position: position }
110
- end
111
- end
112
89
  end
113
90
  end
114
91
  end
@@ -19,8 +19,9 @@
19
19
  # Authors: Michal Mocnak <michal@marigan.net>
20
20
  #
21
21
 
22
+ require 'diabible/parser/helpers/constants'
22
23
  require 'diabible/parser/helpers/info'
23
- require 'diabible/parser/helpers/notes'
24
+ require 'diabible/parser/helpers/metadata'
24
25
  require 'diabible/parser/helpers/text'
25
26
 
26
27
  module Diabible
@@ -22,28 +22,10 @@
22
22
  module Diabible
23
23
  module Parser
24
24
  module Helpers
25
- module Notes
25
+ module Constants
26
26
 
27
- private
28
-
29
- # parse notes box
30
- def parse_notes
31
- # iterate through all paragraphs with style
32
- @document.xpath("//body/p").each do |p|
33
- # store pStyle value
34
- pStyle = p.xpath(".//pStyle/@val")
35
- # process only those with pstyle
36
- unless pStyle.empty? or ['Hlavicka', 'Incipit', 'Podnadpis', 'Podnadpis1'].include?(pStyle.first.value)
37
- # iterate over r blocks
38
- p.xpath(".//r").each do |r|
39
- name = pStyle.first.value
40
- value = content(r, ".//t")
41
- # add into container
42
- add_metadata("note", name, value)
43
- end
44
- end
45
- end
46
- end
27
+ NONE = 'none'
28
+ PARAGRAPH = 'paragraph'
47
29
  end
48
30
  end
49
31
  end
@@ -0,0 +1,69 @@
1
+ #
2
+ # Copyright (C) 2021 diabible.com
3
+ #
4
+ # This file is part of Diabible Parser.
5
+ #
6
+ # Diabible Parser is free software: you can redistribute it and/or modify
7
+ # it under the terms of the GNU General Public License as published by
8
+ # the Free Software Foundation, either version 3 of the License, or
9
+ # (at your option) any later version.
10
+ #
11
+ # Diabible Parser is distributed in the hope that it will be useful,
12
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ # GNU General Public License for more details.
15
+ #
16
+ # You should have received a copy of the GNU General Public License
17
+ # along with Diabible Parser. If not, see <http://www.gnu.org/licenses/>.
18
+ #
19
+ # Authors: Michal Mocnak <michal@marigan.net>
20
+ #
21
+
22
+ module Diabible
23
+ module Parser
24
+ module Helpers
25
+ module Metadata
26
+
27
+ # add meta hash
28
+ def add_metadata(group, type, value, position = nil)
29
+ # get index
30
+ index = count
31
+ # normally add a new meta hash with unique name
32
+ @metadata << { name: "#{index}_#{group}_#{type}", group: group, type: type, index: index, value: value, position: position }
33
+ end
34
+
35
+ # get metadata array per group or type
36
+ def get_metadata(group: nil, type: nil, filter: :exclusive, filters: [])
37
+ if group and type
38
+ return metadata_query(filter: filter, filters: filters) do |m|
39
+ m[:group] == group and m[:type] == type
40
+ end
41
+ elsif group
42
+ return metadata_query(filter: filter, filters: filters) do |m|
43
+ m[:group] == group
44
+ end
45
+ elsif type
46
+ return metadata_query(filter: filter, filters: filters) do |m|
47
+ m[:type] == type
48
+ end
49
+ end
50
+ # empty when nothing selected
51
+ return []
52
+ end
53
+
54
+ def metadata_query(filter: :exclusive, filters: [])
55
+ return @metadata.select do |m|
56
+ if filters.empty?
57
+ yield m
58
+ else
59
+ # get filters intersection
60
+ filtered = filters & [m[:type], m[:group]]
61
+ # resolution
62
+ (yield m) and (filter == :exclusive ? filtered.empty? : !filtered.empty?)
63
+ end
64
+ end
65
+ end
66
+ end
67
+ end
68
+ end
69
+ end
@@ -19,49 +19,50 @@
19
19
  # Authors: Michal Mocnak <michal@marigan.net>
20
20
  #
21
21
 
22
+ require 'diabible/parser/helpers/constants'
23
+
22
24
  module Diabible
23
25
  module Parser
24
26
  module Helpers
25
27
  module Text
28
+ include Diabible::Parser::Helpers::Constants
26
29
 
27
30
  private
28
31
 
29
- # paragraph type id
30
- PARAGRAPH = 'paragraph'
31
-
32
32
  # parse main text entries
33
33
  def parse_text
34
34
  # iterate through all paragraphs without style
35
35
  @document.xpath("//body/p").each do |p|
36
36
  # store pStyle value
37
37
  pStyle = p.xpath(".//pStyle/@val")
38
- # process only those without pstyle
39
- if pStyle.empty? or (pStyle.size and ['Incipit', 'Podnadpis', 'Podnadpis1'].include?(pStyle.first.value))
40
- # position in for paragraph
41
- paragraph_in = @text.length
42
- # iterate over r blocks
43
- p.xpath(".//r").each do |r|
44
- # store rstyle value
45
- rStyle = r.xpath(".//rStyle/@val")
46
- # if empty then it's a plain text without style
47
- if rStyle.empty? or rStyle.first.value === 'Text'
48
- # update text object
49
- @text += content(r, ".//t")
50
- else
51
- # otherwise create metadata field
52
- name = rStyle.first.value
53
- value = content(r, ".//t")
54
- # add into container
55
- add_metadata("text", name, value, { in: @text.length })
56
- end
57
- end
58
- # position out for paragraph
59
- paragraph_out = @text.length
60
- # filter paragraphs without content
61
- if paragraph_in != paragraph_out
62
- add_metadata("text", PARAGRAPH, nil, { in: paragraph_in, out: paragraph_out })
38
+ # prepare group
39
+ group = pStyle.first.value unless pStyle.empty?
40
+ group ||= NONE
41
+ # position in for paragraph against plain text
42
+ paragraph_in = @text.length
43
+ # iterate over r blocks
44
+ p.xpath(".//r").each do |r|
45
+ # store rstyle value
46
+ rStyle = r.xpath(".//rStyle/@val")
47
+ # if empty then it's a plain text without style
48
+ if (rStyle.empty? or rStyle.first.value === 'Text') and pStyle.empty?
49
+ # update text object
50
+ @text += content(r, ".//t")
51
+ else
52
+ # otherwise create metadata field
53
+ # prepare type
54
+ type = rStyle.first.value unless rStyle.empty?
55
+ type ||= NONE
56
+ # prepare value
57
+ value = content(r, ".//t")
58
+ # add into container
59
+ add_metadata(group, type, value, { in: @text.length })
63
60
  end
64
61
  end
62
+ # position out for paragraph
63
+ paragraph_out = @text.length
64
+ # record the paragraph span (added unconditionally; paragraphs without content are no longer filtered out)
65
+ add_metadata(PARAGRAPH, group, NONE, { in: paragraph_in, out: paragraph_out })
65
66
  # limit check
66
67
  if @limit > 0
67
68
  if @metadata.size >= @limit
@@ -21,6 +21,6 @@
21
21
 
22
22
  module Diabible
23
23
  module Parser
24
- VERSION = "0.1.9"
24
+ VERSION = "0.3.0"
25
25
  end
26
26
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: diabible-parser
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.9
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Michal Mocnak
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-03-02 00:00:00.000000000 Z
11
+ date: 2021-03-08 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri
@@ -65,8 +65,9 @@ files:
65
65
  - lib/diabible/parser.rb
66
66
  - lib/diabible/parser/document.rb
67
67
  - lib/diabible/parser/helpers.rb
68
+ - lib/diabible/parser/helpers/constants.rb
68
69
  - lib/diabible/parser/helpers/info.rb
69
- - lib/diabible/parser/helpers/notes.rb
70
+ - lib/diabible/parser/helpers/metadata.rb
70
71
  - lib/diabible/parser/helpers/text.rb
71
72
  - lib/diabible/parser/version.rb
72
73
  - spec/spec_helper.rb