diabible-parser 0.1.6 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 4890da70174e919b49e385c291425fc5c2941230f410107b41aede336ca931ef
4
- data.tar.gz: 5f97266c8dcd64e27a8a208fca5d8239abfd446c91a3a4cf6d12bfa5a83df79c
3
+ metadata.gz: c8b00b4ed3517b91ce76c3d217e46ab5e3a7a43c95776517d75edd2102661f46
4
+ data.tar.gz: 1ce6691990f539cc0fd2841400e03d29f6d32019f9f7ecd3f0560180896281a9
5
5
  SHA512:
6
- metadata.gz: dabfdfaaf2e7a256d9cdc94d29a68973602f9a254a55206dd35cc5ffcd710af27b3b6d0461c119ca4b2cad82cecdce85e01a385c931d9b9bda10ba09ff1d3ede
7
- data.tar.gz: c8545b7ffa872bb1161917be990d0c50297018512c30730e97f039bc920689cd1b9f5d6ec7d62e9b697b132a33bf87ac2185a047592973ee0535aacc26cefb51
6
+ metadata.gz: 4f6abc6a44871efb78474f513c9d79d217e26baec0033c89a51fca26c248df10c3d0ab6401c67cca656ccecdd8aad2f3d511df56135828e6593b755ebf2ecbcc
7
+ data.tar.gz: cab40cfebe6c0c7a3a03bb74c82bd4ce28caeb14ad820c20fcc252c5812c00c9eb3d654ab02b9e41b0e0fa6135031cd7b3bf7bdafafd44f2eae6b8ba15366064
@@ -29,11 +29,11 @@ module Diabible
29
29
 
30
30
  class Document
31
31
  include Helpers::Info
32
- include Helpers::Notes
32
+ include Helpers::Metadata
33
33
  include Helpers::Text
34
34
 
35
35
  # default constructor
36
- def initialize(document, info: true, notes: true, text: true)
36
+ def initialize(document, info: true, text: true, limit: 0)
37
37
  Zip::File.open(URI.open(document)) do |zip_file|
38
38
  # open content xml
39
39
  @document = Nokogiri::XML(zip_file.read("word/document.xml"))
@@ -43,9 +43,9 @@ module Diabible
43
43
  # defaults
44
44
  @text = ''
45
45
  @metadata = []
46
+ @limit = limit
46
47
  # parse
47
48
  parse_info if info
48
- parse_notes if notes
49
49
  parse_text if text
50
50
  end
51
51
 
@@ -70,19 +70,9 @@ module Diabible
70
70
 
71
71
  private
72
72
 
73
- def info_count
74
- @info_count ||= 0
75
- @info_count += 1
76
- end
77
-
78
- def note_count
79
- @note_count ||= 0
80
- @note_count += 1
81
- end
82
-
83
- def text_count
84
- @text_count ||= 0
85
- @text_count += 1
73
+ def count
74
+ @count ||= 0
75
+ @count += 1
86
76
  end
87
77
 
88
78
  # get text content of all nodes matched by xpath
@@ -96,18 +86,6 @@ module Diabible
96
86
  # return container
97
87
  content
98
88
  end
99
-
100
- # add meta hash
101
- def add_metadata(name, type, value, position = nil)
102
- # firstly check if the last field is not a same type
103
- if @metadata.last and @metadata.last[:type] != PARAGRAPH and @metadata.last[:type] == type
104
- # merge fields together
105
- @metadata.last[:value] += value
106
- else
107
- # normally add a new meta hash with uniqe namee
108
- @metadata << { name: "#{name}_#{send("#{name}_count")}_#{type}", type: type, value: value, position: position }
109
- end
110
- end
111
89
  end
112
90
  end
113
91
  end
@@ -19,8 +19,9 @@
19
19
  # Authors: Michal Mocnak <michal@marigan.net>
20
20
  #
21
21
 
22
+ require 'diabible/parser/helpers/constants'
22
23
  require 'diabible/parser/helpers/info'
23
- require 'diabible/parser/helpers/notes'
24
+ require 'diabible/parser/helpers/metadata'
24
25
  require 'diabible/parser/helpers/text'
25
26
 
26
27
  module Diabible
@@ -22,28 +22,10 @@
22
22
  module Diabible
23
23
  module Parser
24
24
  module Helpers
25
- module Notes
25
+ module Constants
26
26
 
27
- private
28
-
29
- # parse notes box
30
- def parse_notes
31
- # iterate through all paragraphs with style
32
- @document.xpath("//body/p").each do |p|
33
- # store pStyle value
34
- pStyle = p.xpath(".//pStyle/@val")
35
- # process only those with pstyle
36
- unless pStyle.empty? or ['Hlavicka', 'Incipit', 'Podnadpis', 'Podnadpis1'].include?(pStyle.first.value)
37
- # iterate over r blocks
38
- p.xpath(".//r").each do |r|
39
- name = pStyle.first.value
40
- value = content(r, ".//t")
41
- # add into container
42
- add_metadata("note", name, value)
43
- end
44
- end
45
- end
46
- end
27
+ NONE = 'none'
28
+ PARAGRAPH = 'paragraph'
47
29
  end
48
30
  end
49
31
  end
@@ -0,0 +1,71 @@
1
+ #
2
+ # Copyright (C) 2021 diabible.com
3
+ #
4
+ # This file is part of Diabible Parser.
5
+ #
6
+ # Diabible Parser is free software: you can redistribute it and/or modify
7
+ # it under the terms of the GNU General Public License as published by
8
+ # the Free Software Foundation, either version 3 of the License, or
9
+ # (at your option) any later version.
10
+ #
11
+ # Diabible Parser is distributed in the hope that it will be useful,
12
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ # GNU General Public License for more details.
15
+ #
16
+ # You should have received a copy of the GNU General Public License
17
+ # along with Diabible Parser. If not, see <http://www.gnu.org/licenses/>.
18
+ #
19
+ # Authors: Michal Mocnak <michal@marigan.net>
20
+ #
21
+
22
+ module Diabible
23
+ module Parser
24
+ module Helpers
25
+ module Metadata
26
+
27
+ # add meta hash
28
+ def add_metadata(group, type, value, position = nil)
29
+ # get index
30
+ index = count
31
+ # normally add a new meta hash with uniqe name
32
+ @metadata << { name: "#{index}_#{group}_#{type}", group: group, type: type, index: index, value: value, position: position }
33
+ end
34
+
35
+ # get metadata array per group or type
36
+ def get_metadata(group: nil, type: nil, filter: :exclusive, filters: [])
37
+ if group and type
38
+ return metadata_query(filter: filter, filters: filters) do |m|
39
+ m[:group] == group and m[:type] == type
40
+ end
41
+ elsif group
42
+ return metadata_query(filter: filter, filters: filters) do |m|
43
+ m[:group] == group
44
+ end
45
+ elsif type
46
+ return metadata_query(filter: filter, filters: filters) do |m|
47
+ m[:type] == type
48
+ end
49
+ else
50
+ return metadata_query(filter: filter, filters: filters) do |m|
51
+ true
52
+ end
53
+ end
54
+ end
55
+
56
+ def metadata_query(filter: :exclusive, filters: [])
57
+ return @metadata.select do |m|
58
+ if filters.empty?
59
+ yield m
60
+ else
61
+ # get filters intersection
62
+ filtered = filters & [m[:type], m[:group]]
63
+ # resolution
64
+ (yield m) and (filter == :exclusive ? filtered.empty? : !filtered.empty?)
65
+ end
66
+ end
67
+ end
68
+ end
69
+ end
70
+ end
71
+ end
@@ -19,47 +19,55 @@
19
19
  # Authors: Michal Mocnak <michal@marigan.net>
20
20
  #
21
21
 
22
+ require 'diabible/parser/helpers/constants'
23
+
22
24
  module Diabible
23
25
  module Parser
24
26
  module Helpers
25
27
  module Text
28
+ include Diabible::Parser::Helpers::Constants
26
29
 
27
30
  private
28
31
 
29
- # paragraph type id
30
- PARAGRAPH = 'paragraph'
31
-
32
32
  # parse main text entries
33
33
  def parse_text
34
34
  # iterate through all paragraphs without style
35
35
  @document.xpath("//body/p").each do |p|
36
36
  # store pStyle value
37
37
  pStyle = p.xpath(".//pStyle/@val")
38
- # process only those without pstyle
39
- if pStyle.empty? or (pStyle.size and ['Incipit', 'Podnadpis', 'Podnadpis1'].include?(pStyle.first.value))
40
- # position in for paragraph
41
- paragraph_in = @text.length
42
- # iterate over r blocks
43
- p.xpath(".//r").each do |r|
44
- # store rstyle value
45
- rStyle = r.xpath(".//rStyle/@val")
46
- # if empty then it's a plain text without style
47
- if rStyle.empty?
48
- # update text object
49
- @text += content(r, ".//t")
50
- else
51
- # otherwise create metadata field
52
- name = rStyle.first.value
53
- value = content(r, ".//t")
54
- # add into container
55
- add_metadata("text", name, value, { in: @text.length })
56
- end
38
+ # prepare group
39
+ group = pStyle.first.value unless pStyle.empty?
40
+ group ||= NONE
41
+ # position in for paragraph against plain text
42
+ paragraph_in = @text.length
43
+ # iterate over r blocks
44
+ p.xpath(".//r").each do |r|
45
+ # store rstyle value
46
+ rStyle = r.xpath(".//rStyle/@val")
47
+ # if empty then it's a plain text without style
48
+ if (rStyle.empty? or rStyle.first.value === 'Text') and pStyle.empty?
49
+ # update text object
50
+ @text += content(r, ".//t")
51
+ else
52
+ # otherwise create metadata field
53
+ # prepare type
54
+ type = rStyle.first.value unless rStyle.empty?
55
+ type ||= NONE
56
+ # prepare value
57
+ value = content(r, ".//t")
58
+ # add into container
59
+ add_metadata(group, type, value, { in: @text.length })
57
60
  end
58
- # position out for paragraph
59
- paragraph_out = @text.length
60
- # filter paragraphs without content
61
- if paragraph_in != paragraph_out
62
- add_metadata("text", PARAGRAPH, nil, { in: paragraph_in, out: paragraph_out })
61
+ end
62
+ # position out for paragraph
63
+ paragraph_out = @text.length
64
+ # filter paragraphs without content
65
+ add_metadata(PARAGRAPH, group, NONE, { in: paragraph_in, out: paragraph_out })
66
+ # limit check
67
+ if @limit > 0
68
+ if @metadata.size >= @limit
69
+ # when meets limit break
70
+ break
63
71
  end
64
72
  end
65
73
  end
@@ -21,6 +21,6 @@
21
21
 
22
22
  module Diabible
23
23
  module Parser
24
- VERSION = "0.1.6"
24
+ VERSION = "0.3.1"
25
25
  end
26
26
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: diabible-parser
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.6
4
+ version: 0.3.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Michal Mocnak
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-03-01 00:00:00.000000000 Z
11
+ date: 2021-03-08 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri
@@ -65,8 +65,9 @@ files:
65
65
  - lib/diabible/parser.rb
66
66
  - lib/diabible/parser/document.rb
67
67
  - lib/diabible/parser/helpers.rb
68
+ - lib/diabible/parser/helpers/constants.rb
68
69
  - lib/diabible/parser/helpers/info.rb
69
- - lib/diabible/parser/helpers/notes.rb
70
+ - lib/diabible/parser/helpers/metadata.rb
70
71
  - lib/diabible/parser/helpers/text.rb
71
72
  - lib/diabible/parser/version.rb
72
73
  - spec/spec_helper.rb