diabible-parser 0.1.9 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 285570dffa4f4b3492b5f7be27719b19fe8196d50cbb45a193691d4337dda79d
4
- data.tar.gz: 9298c266d811c7703b82cd027a22797846999494760efdc562b3b8f0130415c3
3
+ metadata.gz: f864fedb03063a06330cbd7e774288991bcc2aaecd718765ab0630bdf82f5635
4
+ data.tar.gz: e90ebc75f456a7253effff38b109bff9ea75f0b890c276cc333a2e0dd20b5b08
5
5
  SHA512:
6
- metadata.gz: 3c7c5d92029e832189797bb5776669b05bc294cee0c558a8809c8bdbe1401eb8bafbabdfc927e6e721728c2d27f5e3356979c34a33adf36f18d3de4a988f369a
7
- data.tar.gz: ca8d01a2932ca13ffcc3a1f9f6ff6e51b8008abf520c5a8d21dc670ae4632798ee20b595bd72aae77391bce0ab3e98fef173cebf7ed8f03a72d5923e5b0bc92b
6
+ metadata.gz: 38a09e6764a4dd0f34fa41791e6d7e5835960177fe8fc0b0576c878a8e2ece210e783bdc08c1e4a83c0ea246cf185d875ed7f71071960363214114aa61593188
7
+ data.tar.gz: 5802a91df97212ab7489204ad3c7f1017279076dfb59b89f8b6985cd359b20bf144dc9c9d8094779ddfc1f25a92cd0975830535617695f6bcfcd284935445587
@@ -29,11 +29,11 @@ module Diabible
29
29
 
30
30
  class Document
31
31
  include Helpers::Info
32
- include Helpers::Notes
32
+ include Helpers::Metadata
33
33
  include Helpers::Text
34
34
 
35
35
  # default constructor
36
- def initialize(document, info: true, notes: true, text: true, limit: 0)
36
+ def initialize(document, info: true, text: true, limit: 0)
37
37
  Zip::File.open(URI.open(document)) do |zip_file|
38
38
  # open content xml
39
39
  @document = Nokogiri::XML(zip_file.read("word/document.xml"))
@@ -46,7 +46,6 @@ module Diabible
46
46
  @limit = limit
47
47
  # parse
48
48
  parse_info if info
49
- parse_notes if notes
50
49
  parse_text if text
51
50
  end
52
51
 
@@ -71,19 +70,9 @@ module Diabible
71
70
 
72
71
  private
73
72
 
74
- def info_count
75
- @info_count ||= 0
76
- @info_count += 1
77
- end
78
-
79
- def note_count
80
- @note_count ||= 0
81
- @note_count += 1
82
- end
83
-
84
- def text_count
85
- @text_count ||= 0
86
- @text_count += 1
73
+ def count
74
+ @count ||= 0
75
+ @count += 1
87
76
  end
88
77
 
89
78
  # get text content of all nodes matched by xpath
@@ -97,18 +86,6 @@ module Diabible
97
86
  # return container
98
87
  content
99
88
  end
100
-
101
- # add meta hash
102
- def add_metadata(name, type, value, position = nil)
103
- # firstly check if the last field is not a same type
104
- if @metadata.last and @metadata.last[:type] != PARAGRAPH and @metadata.last[:type] == type
105
- # merge fields together
106
- @metadata.last[:value] += value
107
- else
108
- # normally add a new meta hash with unique name
109
- @metadata << { name: "#{name}_#{send("#{name}_count")}_#{type}", type: type, value: value, position: position }
110
- end
111
- end
112
89
  end
113
90
  end
114
91
  end
@@ -19,8 +19,9 @@
19
19
  # Authors: Michal Mocnak <michal@marigan.net>
20
20
  #
21
21
 
22
+ require 'diabible/parser/helpers/constants'
22
23
  require 'diabible/parser/helpers/info'
23
- require 'diabible/parser/helpers/notes'
24
+ require 'diabible/parser/helpers/metadata'
24
25
  require 'diabible/parser/helpers/text'
25
26
 
26
27
  module Diabible
@@ -22,28 +22,10 @@
22
22
  module Diabible
23
23
  module Parser
24
24
  module Helpers
25
- module Notes
25
+ module Constants
26
26
 
27
- private
28
-
29
- # parse notes box
30
- def parse_notes
31
- # iterate through all paragraphs with style
32
- @document.xpath("//body/p").each do |p|
33
- # store pStyle value
34
- pStyle = p.xpath(".//pStyle/@val")
35
- # process only those with pstyle
36
- unless pStyle.empty? or ['Hlavicka', 'Incipit', 'Podnadpis', 'Podnadpis1'].include?(pStyle.first.value)
37
- # iterate over r blocks
38
- p.xpath(".//r").each do |r|
39
- name = pStyle.first.value
40
- value = content(r, ".//t")
41
- # add into container
42
- add_metadata("note", name, value)
43
- end
44
- end
45
- end
46
- end
27
+ NONE = 'none'
28
+ PARAGRAPH = 'paragraph'
47
29
  end
48
30
  end
49
31
  end
@@ -0,0 +1,69 @@
1
+ #
2
+ # Copyright (C) 2021 diabible.com
3
+ #
4
+ # This file is part of Diabible Parser.
5
+ #
6
+ # Diabible Parser is free software: you can redistribute it and/or modify
7
+ # it under the terms of the GNU General Public License as published by
8
+ # the Free Software Foundation, either version 3 of the License, or
9
+ # (at your option) any later version.
10
+ #
11
+ # Diabible Parser is distributed in the hope that it will be useful,
12
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ # GNU General Public License for more details.
15
+ #
16
+ # You should have received a copy of the GNU General Public License
17
+ # along with Diabible Parser. If not, see <http://www.gnu.org/licenses/>.
18
+ #
19
+ # Authors: Michal Mocnak <michal@marigan.net>
20
+ #
21
+
22
+ module Diabible
23
+ module Parser
24
+ module Helpers
25
+ module Metadata
26
+
27
+ # add meta hash
28
+ def add_metadata(group, type, value, position = nil)
29
+ # get index
30
+ index = count
31
+ # normally add a new meta hash with unique name
32
+ @metadata << { name: "#{index}_#{group}_#{type}", group: group, type: type, index: index, value: value, position: position }
33
+ end
34
+
35
+ # get metadata array per group or type
36
+ def get_metadata(group: nil, type: nil, filter: :exclusive, filters: [])
37
+ if group and type
38
+ return metadata_query(filter: filter, filters: filters) do |m|
39
+ m[:group] == group and m[:type] == type
40
+ end
41
+ elsif group
42
+ return metadata_query(filter: filter, filters: filters) do |m|
43
+ m[:group] == group
44
+ end
45
+ elsif type
46
+ return metadata_query(filter: filter, filters: filters) do |m|
47
+ m[:type] == type
48
+ end
49
+ end
50
+ # empty when nothing selected
51
+ return []
52
+ end
53
+
54
+ def metadata_query(filter: :exclusive, filters: [])
55
+ return @metadata.select do |m|
56
+ if filters.empty?
57
+ yield m
58
+ else
59
+ # get filters intersection
60
+ filtered = filters & [m[:type], m[:group]]
61
+ # resolution
62
+ (yield m) and (filter == :exclusive ? filtered.empty? : !filtered.empty?)
63
+ end
64
+ end
65
+ end
66
+ end
67
+ end
68
+ end
69
+ end
@@ -19,49 +19,50 @@
19
19
  # Authors: Michal Mocnak <michal@marigan.net>
20
20
  #
21
21
 
22
+ require 'diabible/parser/helpers/constants'
23
+
22
24
  module Diabible
23
25
  module Parser
24
26
  module Helpers
25
27
  module Text
28
+ include Diabible::Parser::Helpers::Constants
26
29
 
27
30
  private
28
31
 
29
- # paragraph type id
30
- PARAGRAPH = 'paragraph'
31
-
32
32
  # parse main text entries
33
33
  def parse_text
34
34
  # iterate through all body paragraphs
35
35
  @document.xpath("//body/p").each do |p|
36
36
  # store pStyle value
37
37
  pStyle = p.xpath(".//pStyle/@val")
38
- # process only those without pstyle
39
- if pStyle.empty? or (pStyle.size and ['Incipit', 'Podnadpis', 'Podnadpis1'].include?(pStyle.first.value))
40
- # position in for paragraph
41
- paragraph_in = @text.length
42
- # iterate over r blocks
43
- p.xpath(".//r").each do |r|
44
- # store rstyle value
45
- rStyle = r.xpath(".//rStyle/@val")
46
- # if empty then it's a plain text without style
47
- if rStyle.empty? or rStyle.first.value === 'Text'
48
- # update text object
49
- @text += content(r, ".//t")
50
- else
51
- # otherwise create metadata field
52
- name = rStyle.first.value
53
- value = content(r, ".//t")
54
- # add into container
55
- add_metadata("text", name, value, { in: @text.length })
56
- end
57
- end
58
- # position out for paragraph
59
- paragraph_out = @text.length
60
- # filter paragraphs without content
61
- if paragraph_in != paragraph_out
62
- add_metadata("text", PARAGRAPH, nil, { in: paragraph_in, out: paragraph_out })
38
+ # prepare group
39
+ group = pStyle.first.value unless pStyle.empty?
40
+ group ||= NONE
41
+ # position in for paragraph against plain text
42
+ paragraph_in = @text.length
43
+ # iterate over r blocks
44
+ p.xpath(".//r").each do |r|
45
+ # store rstyle value
46
+ rStyle = r.xpath(".//rStyle/@val")
47
+ # if empty then it's a plain text without style
48
+ if (rStyle.empty? or rStyle.first.value === 'Text') and pStyle.empty?
49
+ # update text object
50
+ @text += content(r, ".//t")
51
+ else
52
+ # otherwise create metadata field
53
+ # prepare type
54
+ type = rStyle.first.value unless rStyle.empty?
55
+ type ||= NONE
56
+ # prepare value
57
+ value = content(r, ".//t")
58
+ # add into container
59
+ add_metadata(group, type, value, { in: @text.length })
63
60
  end
64
61
  end
62
+ # position out for paragraph
63
+ paragraph_out = @text.length
64
+ # record paragraph boundaries (paragraphs without content are no longer filtered out)
65
+ add_metadata(PARAGRAPH, group, NONE, { in: paragraph_in, out: paragraph_out })
65
66
  # limit check
66
67
  if @limit > 0
67
68
  if @metadata.size >= @limit
@@ -21,6 +21,6 @@
21
21
 
22
22
  module Diabible
23
23
  module Parser
24
- VERSION = "0.1.9"
24
+ VERSION = "0.3.0"
25
25
  end
26
26
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: diabible-parser
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.9
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Michal Mocnak
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-03-02 00:00:00.000000000 Z
11
+ date: 2021-03-08 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri
@@ -65,8 +65,9 @@ files:
65
65
  - lib/diabible/parser.rb
66
66
  - lib/diabible/parser/document.rb
67
67
  - lib/diabible/parser/helpers.rb
68
+ - lib/diabible/parser/helpers/constants.rb
68
69
  - lib/diabible/parser/helpers/info.rb
69
- - lib/diabible/parser/helpers/notes.rb
70
+ - lib/diabible/parser/helpers/metadata.rb
70
71
  - lib/diabible/parser/helpers/text.rb
71
72
  - lib/diabible/parser/version.rb
72
73
  - spec/spec_helper.rb