chupa-text 1.3.4 → 1.3.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Rakefile +12 -2
- data/doc/text/news.md +18 -0
- data/lib/chupa-text/decomposers/csv.rb +3 -3
- data/lib/chupa-text/decomposers/office-open-xml-document.rb +1 -1
- data/lib/chupa-text/decomposers/office-open-xml-presentation.rb +2 -2
- data/lib/chupa-text/decomposers/office-open-xml-workbook.rb +3 -3
- data/lib/chupa-text/decomposers/office-open-xml.rb +59 -6
- data/lib/chupa-text/decomposers/opendocument-presentation.rb +2 -2
- data/lib/chupa-text/decomposers/opendocument-spreadsheet.rb +4 -4
- data/lib/chupa-text/decomposers/opendocument-text.rb +2 -2
- data/lib/chupa-text/decomposers/xml.rb +2 -2
- data/lib/chupa-text/sax-parser.rb +17 -6
- data/lib/chupa-text/version.rb +2 -2
- data/test/decomposers/test-office-open-xml-workbook.rb +76 -1
- data/test/fixture/xlsx/complex-shared-strings.xlsx +0 -0
- metadata +4 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 111930c1ba73f6eaae79fb538e34a51e72fd29768bca71ec89d43f041c34d960
|
4
|
+
data.tar.gz: 4051f7bc52f057a2e06cd7c7d23d3d8ce1755aae47b2352cc8d9e9e2dddfdfde
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 9ef8a9c17bf65b1d1a7e26112390964738985c3c9b761dc37c157292a324045eba7565576523817fe738f6158aa1d0362260cf13d28360baec2aa8e9942f2fdc
|
7
|
+
data.tar.gz: 75f63640af7ad7a55d0925dcd37ee23e7bd64c83291d80f0796d6e18fff86f2b76340170c7fa1be572070a4bdee9502a8aa9dfbf7e9946fb4dbd06e6fdb1edbf
|
data/Rakefile
CHANGED
@@ -1,6 +1,6 @@
|
|
1
|
-
# -*-
|
1
|
+
# -*- ruby -*-
|
2
2
|
#
|
3
|
-
# Copyright (C) 2013 Kouhei
|
3
|
+
# Copyright (C) 2013-2025 Sutou Kouhei <kou@clear-code.com>
|
4
4
|
#
|
5
5
|
# This library is free software; you can redistribute it and/or
|
6
6
|
# modify it under the terms of the GNU Lesser General Public
|
@@ -46,3 +46,13 @@ desc "Run tests"
|
|
46
46
|
task :test do
|
47
47
|
ruby("test/run-test.rb")
|
48
48
|
end
|
49
|
+
|
50
|
+
release_task = Rake.application["release"]
|
51
|
+
# We use Trusted Publishing.
|
52
|
+
release_task.prerequisites.delete("build")
|
53
|
+
release_task.prerequisites.delete("release:rubygem_push")
|
54
|
+
release_task_comment = release_task.comment
|
55
|
+
if release_task_comment
|
56
|
+
release_task.clear_comments
|
57
|
+
release_task.comment = release_task_comment.gsub(/ and build.*$/, "")
|
58
|
+
end
|
data/doc/text/news.md
CHANGED
@@ -1,5 +1,23 @@
|
|
1
1
|
# News
|
2
2
|
|
3
|
+
## 1.3.6: 2025-01-10
|
4
|
+
|
5
|
+
### Fixes
|
6
|
+
|
7
|
+
* xlsx: Fixed a bug that wrong text is extracted with complex shared
|
8
|
+
strings.
|
9
|
+
* Reported by Tomohisa Kusukawa.
|
10
|
+
|
11
|
+
### Thanks
|
12
|
+
|
13
|
+
* Tomohisa Kusukawa
|
14
|
+
|
15
|
+
## 1.3.5: 2024-09-22
|
16
|
+
|
17
|
+
### Improvements
|
18
|
+
|
19
|
+
* Added support for REXML 3.3.2 or later.
|
20
|
+
|
3
21
|
## 1.3.4: 2024-09-22
|
4
22
|
|
5
23
|
### Improvements
|
@@ -1,4 +1,4 @@
|
|
1
|
-
# Copyright (C) 2013-
|
1
|
+
# Copyright (C) 2013-2024 Sutou Kouhei <kou@clear-code.com>
|
2
2
|
#
|
3
3
|
# This library is free software; you can redistribute it and/or
|
4
4
|
# modify it under the terms of the GNU Lesser General Public
|
@@ -36,7 +36,7 @@ module ChupaText
|
|
36
36
|
end
|
37
37
|
|
38
38
|
def decompose(data)
|
39
|
-
text = ""
|
39
|
+
text = +""
|
40
40
|
data.open do |input|
|
41
41
|
begin
|
42
42
|
csv = ::CSV.new(input, liberal_parsing: true)
|
@@ -68,7 +68,7 @@ module ChupaText
|
|
68
68
|
width, height = data.expected_screenshot_size
|
69
69
|
max_n_lines = 10
|
70
70
|
font_size = height / max_n_lines
|
71
|
-
target_text = ""
|
71
|
+
target_text = +""
|
72
72
|
text.each_line.with_index do |line, i|
|
73
73
|
break if i == max_n_lines
|
74
74
|
target_text << line
|
@@ -1,4 +1,4 @@
|
|
1
|
-
# Copyright (C) 2019 Kouhei
|
1
|
+
# Copyright (C) 2019-2024 Sutou Kouhei <kou@clear-code.com>
|
2
2
|
#
|
3
3
|
# This library is free software; you can redistribute it and/or
|
4
4
|
# modify it under the terms of the GNU Lesser General Public
|
@@ -56,7 +56,7 @@ module ChupaText
|
|
56
56
|
case entry.zip_path
|
57
57
|
when /\Appt\/slides\/slide(\d+)\.xml/
|
58
58
|
nth_slide = Integer($1, 10)
|
59
|
-
slide_text = ""
|
59
|
+
slide_text = +""
|
60
60
|
extract_text(entry, slide_text)
|
61
61
|
context[:slides] << [nth_slide, slide_text]
|
62
62
|
end
|
@@ -1,4 +1,4 @@
|
|
1
|
-
# Copyright (C) 2019-
|
1
|
+
# Copyright (C) 2019-2025 Sutou Kouhei <kou@clear-code.com>
|
2
2
|
#
|
3
3
|
# This library is free software; you can redistribute it and/or
|
4
4
|
# modify it under the terms of the GNU Lesser General Public
|
@@ -49,7 +49,7 @@ module ChupaText
|
|
49
49
|
def process_entry(entry, context)
|
50
50
|
case entry.zip_path
|
51
51
|
when "xl/sharedStrings.xml"
|
52
|
-
|
52
|
+
extract_shared_strings(entry, context[:shared_strings])
|
53
53
|
when "xl/workbook.xml"
|
54
54
|
listener = WorkbookListener.new(context[:sheet_names])
|
55
55
|
parse(entry.file_data, listener)
|
@@ -73,7 +73,7 @@ module ChupaText
|
|
73
73
|
sheets = context[:sheets].sort_by(&:first).collect(&:last)
|
74
74
|
sheet_names = context[:sheet_names]
|
75
75
|
sheets.each_with_index do |sheet, i|
|
76
|
-
sheet_text = ""
|
76
|
+
sheet_text = +""
|
77
77
|
sheet.each do |row|
|
78
78
|
row_texts = row.collect do |cell|
|
79
79
|
case cell
|
@@ -1,4 +1,4 @@
|
|
1
|
-
# Copyright (C) 2019 Kouhei
|
1
|
+
# Copyright (C) 2019-2025 Sutou Kouhei <kou@clear-code.com>
|
2
2
|
#
|
3
3
|
# This library is free software; you can redistribute it and/or
|
4
4
|
# modify it under the terms of the GNU Lesser General Public
|
@@ -70,6 +70,11 @@ module ChupaText
|
|
70
70
|
parse(entry.file_data, listener)
|
71
71
|
end
|
72
72
|
|
73
|
+
def extract_shared_strings(entry, shared_strings)
|
74
|
+
listener = SharedStringsListener.new(shared_strings, @namespace_uri)
|
75
|
+
parse(entry.file_data, listener)
|
76
|
+
end
|
77
|
+
|
73
78
|
def log_tag
|
74
79
|
"[decomposer][office-open-xml]"
|
75
80
|
end
|
@@ -90,26 +95,74 @@ module ChupaText
|
|
90
95
|
end
|
91
96
|
|
92
97
|
def end_element(uri, local_name, qname)
|
98
|
+
if uri == @target_uri
|
99
|
+
case local_name
|
100
|
+
when "p", "br"
|
101
|
+
@output << "\n"
|
102
|
+
when "t"
|
103
|
+
@in_target = false
|
104
|
+
end
|
105
|
+
end
|
106
|
+
end
|
107
|
+
|
108
|
+
def characters(text)
|
109
|
+
add_text(text)
|
110
|
+
end
|
111
|
+
|
112
|
+
def cdata(content)
|
113
|
+
add_text(content)
|
114
|
+
end
|
115
|
+
|
116
|
+
private
|
117
|
+
def add_text(text)
|
118
|
+
return unless @in_target
|
119
|
+
@output << text
|
120
|
+
end
|
121
|
+
end
|
122
|
+
|
123
|
+
class SharedStringsListener < SAXListener
|
124
|
+
def initialize(output, target_uri)
|
125
|
+
@output = output
|
126
|
+
@target_uri = target_uri
|
127
|
+
@tag_stack = []
|
93
128
|
@in_target = false
|
129
|
+
@current_text = +""
|
130
|
+
end
|
131
|
+
|
132
|
+
def start_element(uri, local_name, qname, attributes)
|
133
|
+
@tag_stack << local_name
|
134
|
+
|
135
|
+
return unless uri == @target_uri
|
136
|
+
case local_name
|
137
|
+
when "t"
|
138
|
+
@in_target = true
|
139
|
+
@current_text = +""
|
140
|
+
end
|
141
|
+
end
|
94
142
|
|
143
|
+
def end_element(uri, local_name, qname)
|
95
144
|
return unless uri == @target_uri
|
96
145
|
case local_name
|
97
|
-
when "
|
98
|
-
@
|
146
|
+
when "t"
|
147
|
+
add_text(@current_text)
|
148
|
+
@in_target = false
|
99
149
|
end
|
150
|
+
ensure
|
151
|
+
@tag_stack.pop
|
100
152
|
end
|
101
153
|
|
102
154
|
def characters(text)
|
103
|
-
|
155
|
+
@current_text << text if @in_target
|
104
156
|
end
|
105
157
|
|
106
158
|
def cdata(content)
|
107
|
-
|
159
|
+
@current_text << content if @in_target
|
108
160
|
end
|
109
161
|
|
110
162
|
private
|
111
163
|
def add_text(text)
|
112
|
-
|
164
|
+
parent_tag = @tag_stack[-2]
|
165
|
+
return unless parent_tag == "si"
|
113
166
|
@output << text
|
114
167
|
end
|
115
168
|
end
|
@@ -1,4 +1,4 @@
|
|
1
|
-
# Copyright (C) 2019 Kouhei
|
1
|
+
# Copyright (C) 2019-2024 Sutou Kouhei <kou@clear-code.com>
|
2
2
|
#
|
3
3
|
# This library is free software; you can redistribute it and/or
|
4
4
|
# modify it under the terms of the GNU Lesser General Public
|
@@ -72,7 +72,7 @@ module ChupaText
|
|
72
72
|
when DRAW_URI
|
73
73
|
case local_name
|
74
74
|
when "page"
|
75
|
-
@slides << {text: ""}
|
75
|
+
@slides << {text: +""}
|
76
76
|
end
|
77
77
|
end
|
78
78
|
end
|
@@ -1,4 +1,4 @@
|
|
1
|
-
# Copyright (C) 2019 Kouhei
|
1
|
+
# Copyright (C) 2019-2024 Sutou Kouhei <kou@clear-code.com>
|
2
2
|
#
|
3
3
|
# This library is free software; you can redistribute it and/or
|
4
4
|
# modify it under the terms of the GNU Lesser General Public
|
@@ -96,9 +96,9 @@ module ChupaText
|
|
96
96
|
when "table-row"
|
97
97
|
@sheets.last[:rows] << []
|
98
98
|
when "table-cell"
|
99
|
-
@sheets.last[:rows].last << {text: ""}
|
99
|
+
@sheets.last[:rows].last << {text: +""}
|
100
100
|
when "covered-table-cell"
|
101
|
-
@sheets.last[:rows].last << {text: ""}
|
101
|
+
@sheets.last[:rows].last << {text: +""}
|
102
102
|
when "shapes"
|
103
103
|
@in_shapes = true
|
104
104
|
end
|
@@ -116,7 +116,7 @@ module ChupaText
|
|
116
116
|
case local_name
|
117
117
|
when "table"
|
118
118
|
sheet = @sheets.last
|
119
|
-
text = ""
|
119
|
+
text = +""
|
120
120
|
shape_texts = sheet[:shape_texts]
|
121
121
|
unless shape_texts.empty?
|
122
122
|
text << shape_texts.join("\n") << "\n"
|
@@ -1,4 +1,4 @@
|
|
1
|
-
# Copyright (C) 2019 Kouhei
|
1
|
+
# Copyright (C) 2019-2024 Sutou Kouhei <kou@clear-code.com>
|
2
2
|
#
|
3
3
|
# This library is free software; you can redistribute it and/or
|
4
4
|
# modify it under the terms of the GNU Lesser General Public
|
@@ -29,7 +29,7 @@ module ChupaText
|
|
29
29
|
|
30
30
|
private
|
31
31
|
def process_content(entry, context, &block)
|
32
|
-
context[:text] = ""
|
32
|
+
context[:text] = +""
|
33
33
|
listener = TextListener.new(context[:text])
|
34
34
|
parse(entry.file_data, listener)
|
35
35
|
end
|
@@ -1,4 +1,4 @@
|
|
1
|
-
# Copyright (C) 2013-
|
1
|
+
# Copyright (C) 2013-2024 Sutou Kouhei <kou@clear-code.com>
|
2
2
|
#
|
3
3
|
# This library is free software; you can redistribute it and/or
|
4
4
|
# modify it under the terms of the GNU Lesser General Public
|
@@ -29,7 +29,7 @@ module ChupaText
|
|
29
29
|
end
|
30
30
|
|
31
31
|
def decompose(data)
|
32
|
-
text = ""
|
32
|
+
text = +""
|
33
33
|
listener = Listener.new(text)
|
34
34
|
data.open do |input|
|
35
35
|
begin
|
@@ -1,4 +1,4 @@
|
|
1
|
-
# Copyright (C) 2019 Kouhei
|
1
|
+
# Copyright (C) 2019-2024 Sutou Kouhei <kou@clear-code.com>
|
2
2
|
#
|
3
3
|
# This library is free software; you can redistribute it and/or
|
4
4
|
# modify it under the terms of the GNU Lesser General Public
|
@@ -16,6 +16,7 @@
|
|
16
16
|
|
17
17
|
require "cgi/util"
|
18
18
|
require "rexml/parsers/sax2parser"
|
19
|
+
require "rexml/rexml"
|
19
20
|
require "rexml/sax2listener"
|
20
21
|
|
21
22
|
begin
|
@@ -156,12 +157,22 @@ module ChupaText
|
|
156
157
|
@listener.end_element(*args)
|
157
158
|
end
|
158
159
|
|
159
|
-
|
160
|
-
|
161
|
-
|
160
|
+
if (REXML::VERSION <=> "3.3.2") >= 0
|
161
|
+
def characters(text)
|
162
|
+
@listener.characters(text)
|
163
|
+
end
|
164
|
+
|
165
|
+
def cdata(content)
|
166
|
+
@listener.cdata(content)
|
167
|
+
end
|
168
|
+
else
|
169
|
+
def characters(text)
|
170
|
+
@listener.characters(CGI.unescapeHTML(text))
|
171
|
+
end
|
162
172
|
|
163
|
-
|
164
|
-
|
173
|
+
def cdata(content)
|
174
|
+
@listener.cdata(CGI.unescapeHTML(content))
|
175
|
+
end
|
165
176
|
end
|
166
177
|
end
|
167
178
|
end
|
data/lib/chupa-text/version.rb
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
# Copyright (C) 2013-
|
1
|
+
# Copyright (C) 2013-2024 Sutou Kouhei <kou@clear-code.com>
|
2
2
|
#
|
3
3
|
# This library is free software; you can redistribute it and/or
|
4
4
|
# modify it under the terms of the GNU Lesser General Public
|
@@ -15,5 +15,5 @@
|
|
15
15
|
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
16
16
|
|
17
17
|
module ChupaText
|
18
|
-
VERSION = "1.3.4"
|
18
|
+
VERSION = "1.3.6"
|
19
19
|
end
|
@@ -1,4 +1,4 @@
|
|
1
|
-
# Copyright (C) 2019 Kouhei
|
1
|
+
# Copyright (C) 2019-2025 Sutou Kouhei <kou@clear-code.com>
|
2
2
|
#
|
3
3
|
# This library is free software; you can redistribute it and/or
|
4
4
|
# modify it under the terms of the GNU Lesser General Public
|
@@ -153,6 +153,81 @@ class TestDecomposersOfficeOpenXMLWorkbook < Test::Unit::TestCase
|
|
153
153
|
end
|
154
154
|
end
|
155
155
|
|
156
|
+
def test_complex_shared_strings
|
157
|
+
path = fixture_path("xlsx", "complex-shared-strings.xlsx")
|
158
|
+
actual = decompose(path).collect do |data|
|
159
|
+
[
|
160
|
+
data["index"],
|
161
|
+
data["name"],
|
162
|
+
data.body,
|
163
|
+
]
|
164
|
+
end
|
165
|
+
assert_equal([
|
166
|
+
[nil, nil, ""],
|
167
|
+
[
|
168
|
+
0,
|
169
|
+
"新規",
|
170
|
+
"No\t案件番号\t開始日\t期日\tステータス\t備考\n" +
|
171
|
+
"1\t-\t45664\t45672\t対応中\n" +
|
172
|
+
"2\t-\t45664\t45672\t対応中\n" +
|
173
|
+
"3\t-\t45664\t45672\t対応中\n" +
|
174
|
+
"4\t-\t45664\t45666\t対応中\n" +
|
175
|
+
"5\t-\t45664\t45666\t対応中\n" +
|
176
|
+
"6\t-\t45663\t45665\t承認待ち\n" +
|
177
|
+
"7\t-\t45660\t45665\t承認待ち\n" +
|
178
|
+
"8\t-\t45653\t45663\t承認待ち\n" +
|
179
|
+
"9\t-\t45653\t45663\t承認待ち\n" +
|
180
|
+
"10\tPSR2401770\t45652\t45666\t対応中\n",
|
181
|
+
],
|
182
|
+
[
|
183
|
+
1,
|
184
|
+
"全体",
|
185
|
+
"No\t案件番号\t開始日\t期日\tステータス\n" +
|
186
|
+
"1\tPSR2401564\t45617\t45726\t対応中\n" +
|
187
|
+
"2\tPSR2401194\t45553\t45716\t対応中\n" +
|
188
|
+
"3\t-\t45664\t45672\t対応中\n" +
|
189
|
+
"4\t-\t45664\t45672\t対応中\n" +
|
190
|
+
"5\t-\t45664\t45672\t対応中\n" +
|
191
|
+
"6\t-\t45645\t45672\t対応中\n" +
|
192
|
+
"7\tPSR2401746\t45649\t45671\t対応中\n" +
|
193
|
+
"8\t-\t45640\t45667\t対応中\n" +
|
194
|
+
"9\t-\t45635\t45667\t対応中\n" +
|
195
|
+
"10\tPSR2401605\t45623\t45667\t対応中\n" +
|
196
|
+
"11\t-\t45664\t45666\t対応中\n" +
|
197
|
+
"12\t-\t45664\t45666\t対応中\n" +
|
198
|
+
"13\tPSR2401770\t45652\t45666\t対応中\n" +
|
199
|
+
"14\t-\t45645\t45665\t対応中\n" +
|
200
|
+
"15\tPSR2401609\t45624\t45666\t対応中\n",
|
201
|
+
],
|
202
|
+
[
|
203
|
+
2,
|
204
|
+
"案件",
|
205
|
+
"No\t案件番号\t開始日\t対応完了時期想定\n" +
|
206
|
+
"1\tPSR2401244\t45561.40347222222\t45744\n" +
|
207
|
+
"2\tPSR2401592\t45621.598611111112\t45698\n" +
|
208
|
+
"3\tPSR2401682\t45638.40902777778\t45688\n" +
|
209
|
+
"4\tPSR2401706\t45643.383333333331\t45671\n" +
|
210
|
+
"5\tPSR2401779\t45653.490277777775\t45671\n" +
|
211
|
+
"6\tPSR2401805\t45664.436805555553\t調整中\n" +
|
212
|
+
"7\tPSR2400677\t45455.588194444441\t45651\t完了\n" +
|
213
|
+
"8\tPSR2401666\t45636.405555555553\t45653\t完了\n" +
|
214
|
+
"9\tPSR2401714\t45644.630555555559\t45652\t完了\n",
|
215
|
+
],
|
216
|
+
[
|
217
|
+
3,
|
218
|
+
"障害恒久対応・改善対応",
|
219
|
+
"No\t案件番号\t分類\t開始日\t対応完了時期想定\n" +
|
220
|
+
"1\tPSR2401334\t改善対応\t45576.411805555559\t45688\n" +
|
221
|
+
"2\tPSR2401335\t改善対応\t45576.415277777778\t45688\n" +
|
222
|
+
"3\tPSR2401410\t改善対応\t45588.428472222222\t調整中\n" +
|
223
|
+
"4\tPSR2401411\t改善対応\t45588.432638888888\t調整中\n" +
|
224
|
+
"5\tPSR2401718\t障害恒久対応\t45645.386111111111\t45695\n" +
|
225
|
+
"6\tPSR2401807\t障害恒久対応\t45664.546527777777\t調整中\t1/16リリースで調整中\n",
|
226
|
+
],
|
227
|
+
],
|
228
|
+
actual)
|
229
|
+
end
|
230
|
+
|
156
231
|
sub_test_case("invalid") do
|
157
232
|
def test_empty
|
158
233
|
messages = capture_log do
|
Binary file
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: chupa-text
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.3.4
|
4
|
+
version: 1.3.6
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Sutou Kouhei
|
8
8
|
bindir: bin
|
9
9
|
cert_chain: []
|
10
|
-
date:
|
10
|
+
date: 2025-01-10 00:00:00.000000000 Z
|
11
11
|
dependencies:
|
12
12
|
- !ruby/object:Gem::Dependency
|
13
13
|
name: archive-zip
|
@@ -189,6 +189,7 @@ files:
|
|
189
189
|
- test/fixture/tar/top-level.tar
|
190
190
|
- test/fixture/tar/utf-8.tar
|
191
191
|
- test/fixture/xlsx/attributes.xlsx
|
192
|
+
- test/fixture/xlsx/complex-shared-strings.xlsx
|
192
193
|
- test/fixture/xlsx/empty.xlsx
|
193
194
|
- test/fixture/xlsx/multi-sheets.xlsx
|
194
195
|
- test/fixture/xlsx/not-shared-cell.xlsx
|
@@ -232,7 +233,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
232
233
|
- !ruby/object:Gem::Version
|
233
234
|
version: '0'
|
234
235
|
requirements: []
|
235
|
-
rubygems_version: 3.6.
|
236
|
+
rubygems_version: 3.6.2
|
236
237
|
specification_version: 4
|
237
238
|
summary: ChupaText is an extensible text extractor. You can plug your custom text
|
238
239
|
extractor in ChupaText. You can write your plugin by Ruby.
|