diabible-parser 0.1.6 → 0.3.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 4890da70174e919b49e385c291425fc5c2941230f410107b41aede336ca931ef
4
- data.tar.gz: 5f97266c8dcd64e27a8a208fca5d8239abfd446c91a3a4cf6d12bfa5a83df79c
3
+ metadata.gz: c8b00b4ed3517b91ce76c3d217e46ab5e3a7a43c95776517d75edd2102661f46
4
+ data.tar.gz: 1ce6691990f539cc0fd2841400e03d29f6d32019f9f7ecd3f0560180896281a9
5
5
  SHA512:
6
- metadata.gz: dabfdfaaf2e7a256d9cdc94d29a68973602f9a254a55206dd35cc5ffcd710af27b3b6d0461c119ca4b2cad82cecdce85e01a385c931d9b9bda10ba09ff1d3ede
7
- data.tar.gz: c8545b7ffa872bb1161917be990d0c50297018512c30730e97f039bc920689cd1b9f5d6ec7d62e9b697b132a33bf87ac2185a047592973ee0535aacc26cefb51
6
+ metadata.gz: 4f6abc6a44871efb78474f513c9d79d217e26baec0033c89a51fca26c248df10c3d0ab6401c67cca656ccecdd8aad2f3d511df56135828e6593b755ebf2ecbcc
7
+ data.tar.gz: cab40cfebe6c0c7a3a03bb74c82bd4ce28caeb14ad820c20fcc252c5812c00c9eb3d654ab02b9e41b0e0fa6135031cd7b3bf7bdafafd44f2eae6b8ba15366064
@@ -29,11 +29,11 @@ module Diabible
29
29
 
30
30
  class Document
31
31
  include Helpers::Info
32
- include Helpers::Notes
32
+ include Helpers::Metadata
33
33
  include Helpers::Text
34
34
 
35
35
  # default constructor
36
- def initialize(document, info: true, notes: true, text: true)
36
+ def initialize(document, info: true, text: true, limit: 0)
37
37
  Zip::File.open(URI.open(document)) do |zip_file|
38
38
  # open content xml
39
39
  @document = Nokogiri::XML(zip_file.read("word/document.xml"))
@@ -43,9 +43,9 @@ module Diabible
43
43
  # defaults
44
44
  @text = ''
45
45
  @metadata = []
46
+ @limit = limit
46
47
  # parse
47
48
  parse_info if info
48
- parse_notes if notes
49
49
  parse_text if text
50
50
  end
51
51
 
@@ -70,19 +70,9 @@ module Diabible
70
70
 
71
71
  private
72
72
 
73
- def info_count
74
- @info_count ||= 0
75
- @info_count += 1
76
- end
77
-
78
- def note_count
79
- @note_count ||= 0
80
- @note_count += 1
81
- end
82
-
83
- def text_count
84
- @text_count ||= 0
85
- @text_count += 1
73
+ def count
74
+ @count ||= 0
75
+ @count += 1
86
76
  end
87
77
 
88
78
  # get text content of all nodes matched by xpath
@@ -96,18 +86,6 @@ module Diabible
96
86
  # return container
97
87
  content
98
88
  end
99
-
100
- # add meta hash
101
- def add_metadata(name, type, value, position = nil)
102
- # firstly check if the last field is not a same type
103
- if @metadata.last and @metadata.last[:type] != PARAGRAPH and @metadata.last[:type] == type
104
- # merge fields together
105
- @metadata.last[:value] += value
106
- else
107
- # normally add a new meta hash with unique name
108
- @metadata << { name: "#{name}_#{send("#{name}_count")}_#{type}", type: type, value: value, position: position }
109
- end
110
- end
111
89
  end
112
90
  end
113
91
  end
@@ -19,8 +19,9 @@
19
19
  # Authors: Michal Mocnak <michal@marigan.net>
20
20
  #
21
21
 
22
+ require 'diabible/parser/helpers/constants'
22
23
  require 'diabible/parser/helpers/info'
23
- require 'diabible/parser/helpers/notes'
24
+ require 'diabible/parser/helpers/metadata'
24
25
  require 'diabible/parser/helpers/text'
25
26
 
26
27
  module Diabible
@@ -22,28 +22,10 @@
22
22
  module Diabible
23
23
  module Parser
24
24
  module Helpers
25
- module Notes
25
+ module Constants
26
26
 
27
- private
28
-
29
- # parse notes box
30
- def parse_notes
31
- # iterate through all paragraphs with style
32
- @document.xpath("//body/p").each do |p|
33
- # store pStyle value
34
- pStyle = p.xpath(".//pStyle/@val")
35
- # process only those with pstyle
36
- unless pStyle.empty? or ['Hlavicka', 'Incipit', 'Podnadpis', 'Podnadpis1'].include?(pStyle.first.value)
37
- # iterate over r blocks
38
- p.xpath(".//r").each do |r|
39
- name = pStyle.first.value
40
- value = content(r, ".//t")
41
- # add into container
42
- add_metadata("note", name, value)
43
- end
44
- end
45
- end
46
- end
27
+ NONE = 'none'
28
+ PARAGRAPH = 'paragraph'
47
29
  end
48
30
  end
49
31
  end
@@ -0,0 +1,71 @@
1
+ #
2
+ # Copyright (C) 2021 diabible.com
3
+ #
4
+ # This file is part of Diabible Parser.
5
+ #
6
+ # Diabible Parser is free software: you can redistribute it and/or modify
7
+ # it under the terms of the GNU General Public License as published by
8
+ # the Free Software Foundation, either version 3 of the License, or
9
+ # (at your option) any later version.
10
+ #
11
+ # Diabible Parser is distributed in the hope that it will be useful,
12
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ # GNU General Public License for more details.
15
+ #
16
+ # You should have received a copy of the GNU General Public License
17
+ # along with Diabible Parser. If not, see <http://www.gnu.org/licenses/>.
18
+ #
19
+ # Authors: Michal Mocnak <michal@marigan.net>
20
+ #
21
+
22
+ module Diabible
23
+ module Parser
24
+ module Helpers
25
+ module Metadata
26
+
27
+ # add meta hash
28
+ def add_metadata(group, type, value, position = nil)
29
+ # get index
30
+ index = count
31
+ # normally add a new meta hash with unique name
32
+ @metadata << { name: "#{index}_#{group}_#{type}", group: group, type: type, index: index, value: value, position: position }
33
+ end
34
+
35
+ # get metadata array per group or type
36
+ def get_metadata(group: nil, type: nil, filter: :exclusive, filters: [])
37
+ if group and type
38
+ return metadata_query(filter: filter, filters: filters) do |m|
39
+ m[:group] == group and m[:type] == type
40
+ end
41
+ elsif group
42
+ return metadata_query(filter: filter, filters: filters) do |m|
43
+ m[:group] == group
44
+ end
45
+ elsif type
46
+ return metadata_query(filter: filter, filters: filters) do |m|
47
+ m[:type] == type
48
+ end
49
+ else
50
+ return metadata_query(filter: filter, filters: filters) do |m|
51
+ true
52
+ end
53
+ end
54
+ end
55
+
56
+ def metadata_query(filter: :exclusive, filters: [])
57
+ return @metadata.select do |m|
58
+ if filters.empty?
59
+ yield m
60
+ else
61
+ # get filters intersection
62
+ filtered = filters & [m[:type], m[:group]]
63
+ # resolution
64
+ (yield m) and (filter == :exclusive ? filtered.empty? : !filtered.empty?)
65
+ end
66
+ end
67
+ end
68
+ end
69
+ end
70
+ end
71
+ end
@@ -19,47 +19,55 @@
19
19
  # Authors: Michal Mocnak <michal@marigan.net>
20
20
  #
21
21
 
22
+ require 'diabible/parser/helpers/constants'
23
+
22
24
  module Diabible
23
25
  module Parser
24
26
  module Helpers
25
27
  module Text
28
+ include Diabible::Parser::Helpers::Constants
26
29
 
27
30
  private
28
31
 
29
- # paragraph type id
30
- PARAGRAPH = 'paragraph'
31
-
32
32
  # parse main text entries
33
33
  def parse_text
34
34
  # iterate through all paragraphs without style
35
35
  @document.xpath("//body/p").each do |p|
36
36
  # store pStyle value
37
37
  pStyle = p.xpath(".//pStyle/@val")
38
- # process only those without pstyle
39
- if pStyle.empty? or (pStyle.size and ['Incipit', 'Podnadpis', 'Podnadpis1'].include?(pStyle.first.value))
40
- # position in for paragraph
41
- paragraph_in = @text.length
42
- # iterate over r blocks
43
- p.xpath(".//r").each do |r|
44
- # store rstyle value
45
- rStyle = r.xpath(".//rStyle/@val")
46
- # if empty then it's a plain text without style
47
- if rStyle.empty?
48
- # update text object
49
- @text += content(r, ".//t")
50
- else
51
- # otherwise create metadata field
52
- name = rStyle.first.value
53
- value = content(r, ".//t")
54
- # add into container
55
- add_metadata("text", name, value, { in: @text.length })
56
- end
38
+ # prepare group
39
+ group = pStyle.first.value unless pStyle.empty?
40
+ group ||= NONE
41
+ # position in for paragraph against plain text
42
+ paragraph_in = @text.length
43
+ # iterate over r blocks
44
+ p.xpath(".//r").each do |r|
45
+ # store rstyle value
46
+ rStyle = r.xpath(".//rStyle/@val")
47
+ # if empty then it's a plain text without style
48
+ if (rStyle.empty? or rStyle.first.value === 'Text') and pStyle.empty?
49
+ # update text object
50
+ @text += content(r, ".//t")
51
+ else
52
+ # otherwise create metadata field
53
+ # prepare type
54
+ type = rStyle.first.value unless rStyle.empty?
55
+ type ||= NONE
56
+ # prepare value
57
+ value = content(r, ".//t")
58
+ # add into container
59
+ add_metadata(group, type, value, { in: @text.length })
57
60
  end
58
- # position out for paragraph
59
- paragraph_out = @text.length
60
- # filter paragraphs without content
61
- if paragraph_in != paragraph_out
62
- add_metadata("text", PARAGRAPH, nil, { in: paragraph_in, out: paragraph_out })
61
+ end
62
+ # position out for paragraph
63
+ paragraph_out = @text.length
64
+ # record paragraph boundaries (paragraphs without content are no longer filtered out)
65
+ add_metadata(PARAGRAPH, group, NONE, { in: paragraph_in, out: paragraph_out })
66
+ # limit check
67
+ if @limit > 0
68
+ if @metadata.size >= @limit
69
+ # when meets limit break
70
+ break
63
71
  end
64
72
  end
65
73
  end
@@ -21,6 +21,6 @@
21
21
 
22
22
  module Diabible
23
23
  module Parser
24
- VERSION = "0.1.6"
24
+ VERSION = "0.3.1"
25
25
  end
26
26
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: diabible-parser
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.6
4
+ version: 0.3.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Michal Mocnak
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-03-01 00:00:00.000000000 Z
11
+ date: 2021-03-08 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri
@@ -65,8 +65,9 @@ files:
65
65
  - lib/diabible/parser.rb
66
66
  - lib/diabible/parser/document.rb
67
67
  - lib/diabible/parser/helpers.rb
68
+ - lib/diabible/parser/helpers/constants.rb
68
69
  - lib/diabible/parser/helpers/info.rb
69
- - lib/diabible/parser/helpers/notes.rb
70
+ - lib/diabible/parser/helpers/metadata.rb
70
71
  - lib/diabible/parser/helpers/text.rb
71
72
  - lib/diabible/parser/version.rb
72
73
  - spec/spec_helper.rb