chupa-text 1.3.4 → 1.3.6

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: d81031d80654a357700972d1443aea5994ba4e3f4a1684e670a19b7041b12073
4
- data.tar.gz: 17d66e6f6660066fd8c4fd83baf01eee8c2296a2a7a777ab7c0db3bbf9917d56
3
+ metadata.gz: 111930c1ba73f6eaae79fb538e34a51e72fd29768bca71ec89d43f041c34d960
4
+ data.tar.gz: 4051f7bc52f057a2e06cd7c7d23d3d8ce1755aae47b2352cc8d9e9e2dddfdfde
5
5
  SHA512:
6
- metadata.gz: e3ab55e7923da653b4c277491712e6564db633b664830ad2153b60b0ada7d09de69576074a0a3592406b03041f09990fa7a0daf0fd3f99d798543d22d5416a2c
7
- data.tar.gz: ab8e04142b49119e51c9d7da40776efc674e75034148670c7f921a06320896426f7aba99a42ff467a97d07bd89c776bc4e04ddfa788ce826d51481baf8890470
6
+ metadata.gz: 9ef8a9c17bf65b1d1a7e26112390964738985c3c9b761dc37c157292a324045eba7565576523817fe738f6158aa1d0362260cf13d28360baec2aa8e9942f2fdc
7
+ data.tar.gz: 75f63640af7ad7a55d0925dcd37ee23e7bd64c83291d80f0796d6e18fff86f2b76340170c7fa1be572070a4bdee9502a8aa9dfbf7e9946fb4dbd06e6fdb1edbf
data/Rakefile CHANGED
@@ -1,6 +1,6 @@
1
- # -*- mode: ruby; coding: utf-8 -*-
1
+ # -*- ruby -*-
2
2
  #
3
- # Copyright (C) 2013 Kouhei Sutou <kou@clear-code.com>
3
+ # Copyright (C) 2013-2025 Sutou Kouhei <kou@clear-code.com>
4
4
  #
5
5
  # This library is free software; you can redistribute it and/or
6
6
  # modify it under the terms of the GNU Lesser General Public
@@ -46,3 +46,13 @@ desc "Run tests"
46
46
  task :test do
47
47
  ruby("test/run-test.rb")
48
48
  end
49
+
50
+ release_task = Rake.application["release"]
51
+ # We use Trusted Publishing.
52
+ release_task.prerequisites.delete("build")
53
+ release_task.prerequisites.delete("release:rubygem_push")
54
+ release_task_comment = release_task.comment
55
+ if release_task_comment
56
+ release_task.clear_comments
57
+ release_task.comment = release_task_comment.gsub(/ and build.*$/, "")
58
+ end
data/doc/text/news.md CHANGED
@@ -1,5 +1,23 @@
1
1
  # News
2
2
 
3
+ ## 1.3.6: 2025-01-10
4
+
5
+ ### Fixes
6
+
7
+ * xlsx: Fixed a bug that wrong text is extracted with complex shared
8
+ strings.
9
+ * Reported by Tomohisa Kusukawa.
10
+
11
+ ### Thanks
12
+
13
+ * Tomohisa Kusukawa
14
+
15
+ ## 1.3.5: 2024-09-22
16
+
17
+ ### Improvements
18
+
19
+ * Added support for REXML 3.3.2 or later.
20
+
3
21
  ## 1.3.4: 2024-09-22
4
22
 
5
23
  ### Improvements
@@ -1,4 +1,4 @@
1
- # Copyright (C) 2013-2017 Kouhei Sutou <kou@clear-code.com>
1
+ # Copyright (C) 2013-2024 Sutou Kouhei <kou@clear-code.com>
2
2
  #
3
3
  # This library is free software; you can redistribute it and/or
4
4
  # modify it under the terms of the GNU Lesser General Public
@@ -36,7 +36,7 @@ module ChupaText
36
36
  end
37
37
 
38
38
  def decompose(data)
39
- text = ""
39
+ text = +""
40
40
  data.open do |input|
41
41
  begin
42
42
  csv = ::CSV.new(input, liberal_parsing: true)
@@ -68,7 +68,7 @@ module ChupaText
68
68
  width, height = data.expected_screenshot_size
69
69
  max_n_lines = 10
70
70
  font_size = height / max_n_lines
71
- target_text = ""
71
+ target_text = +""
72
72
  text.each_line.with_index do |line, i|
73
73
  break if i == max_n_lines
74
74
  target_text << line
@@ -41,7 +41,7 @@ module ChupaText
41
41
 
42
42
  private
43
43
  def start_decompose(context)
44
- context[:text] = ""
44
+ context[:text] = +""
45
45
  end
46
46
 
47
47
  def process_entry(entry, context)
@@ -1,4 +1,4 @@
1
- # Copyright (C) 2019 Kouhei Sutou <kou@clear-code.com>
1
+ # Copyright (C) 2019-2024 Sutou Kouhei <kou@clear-code.com>
2
2
  #
3
3
  # This library is free software; you can redistribute it and/or
4
4
  # modify it under the terms of the GNU Lesser General Public
@@ -56,7 +56,7 @@ module ChupaText
56
56
  case entry.zip_path
57
57
  when /\Appt\/slides\/slide(\d+)\.xml/
58
58
  nth_slide = Integer($1, 10)
59
- slide_text = ""
59
+ slide_text = +""
60
60
  extract_text(entry, slide_text)
61
61
  context[:slides] << [nth_slide, slide_text]
62
62
  end
@@ -1,4 +1,4 @@
1
- # Copyright (C) 2019-2022 Sutou Kouhei <kou@clear-code.com>
1
+ # Copyright (C) 2019-2025 Sutou Kouhei <kou@clear-code.com>
2
2
  #
3
3
  # This library is free software; you can redistribute it and/or
4
4
  # modify it under the terms of the GNU Lesser General Public
@@ -49,7 +49,7 @@ module ChupaText
49
49
  def process_entry(entry, context)
50
50
  case entry.zip_path
51
51
  when "xl/sharedStrings.xml"
52
- extract_text(entry, context[:shared_strings])
52
+ extract_shared_strings(entry, context[:shared_strings])
53
53
  when "xl/workbook.xml"
54
54
  listener = WorkbookListener.new(context[:sheet_names])
55
55
  parse(entry.file_data, listener)
@@ -73,7 +73,7 @@ module ChupaText
73
73
  sheets = context[:sheets].sort_by(&:first).collect(&:last)
74
74
  sheet_names = context[:sheet_names]
75
75
  sheets.each_with_index do |sheet, i|
76
- sheet_text = ""
76
+ sheet_text = +""
77
77
  sheet.each do |row|
78
78
  row_texts = row.collect do |cell|
79
79
  case cell
@@ -1,4 +1,4 @@
1
- # Copyright (C) 2019 Kouhei Sutou <kou@clear-code.com>
1
+ # Copyright (C) 2019-2025 Sutou Kouhei <kou@clear-code.com>
2
2
  #
3
3
  # This library is free software; you can redistribute it and/or
4
4
  # modify it under the terms of the GNU Lesser General Public
@@ -70,6 +70,11 @@ module ChupaText
70
70
  parse(entry.file_data, listener)
71
71
  end
72
72
 
73
+ def extract_shared_strings(entry, shared_strings)
74
+ listener = SharedStringsListener.new(shared_strings, @namespace_uri)
75
+ parse(entry.file_data, listener)
76
+ end
77
+
73
78
  def log_tag
74
79
  "[decomposer][office-open-xml]"
75
80
  end
@@ -90,26 +95,74 @@ module ChupaText
90
95
  end
91
96
 
92
97
  def end_element(uri, local_name, qname)
98
+ if uri == @target_uri
99
+ case local_name
100
+ when "p", "br"
101
+ @output << "\n"
102
+ when "t"
103
+ @in_target = false
104
+ end
105
+ end
106
+ end
107
+
108
+ def characters(text)
109
+ add_text(text)
110
+ end
111
+
112
+ def cdata(content)
113
+ add_text(content)
114
+ end
115
+
116
+ private
117
+ def add_text(text)
118
+ return unless @in_target
119
+ @output << text
120
+ end
121
+ end
122
+
123
+ class SharedStringsListener < SAXListener
124
+ def initialize(output, target_uri)
125
+ @output = output
126
+ @target_uri = target_uri
127
+ @tag_stack = []
93
128
  @in_target = false
129
+ @current_text = +""
130
+ end
131
+
132
+ def start_element(uri, local_name, qname, attributes)
133
+ @tag_stack << local_name
134
+
135
+ return unless uri == @target_uri
136
+ case local_name
137
+ when "t"
138
+ @in_target = true
139
+ @current_text = +""
140
+ end
141
+ end
94
142
 
143
+ def end_element(uri, local_name, qname)
95
144
  return unless uri == @target_uri
96
145
  case local_name
97
- when "p", "br"
98
- @output << "\n"
146
+ when "t"
147
+ add_text(@current_text)
148
+ @in_target = false
99
149
  end
150
+ ensure
151
+ @tag_stack.pop
100
152
  end
101
153
 
102
154
  def characters(text)
103
- add_text(text)
155
+ @current_text << text if @in_target
104
156
  end
105
157
 
106
158
  def cdata(content)
107
- add_text(content)
159
+ @current_text << content if @in_target
108
160
  end
109
161
 
110
162
  private
111
163
  def add_text(text)
112
- return unless @in_target
164
+ parent_tag = @tag_stack[-2]
165
+ return unless parent_tag == "si"
113
166
  @output << text
114
167
  end
115
168
  end
@@ -1,4 +1,4 @@
1
- # Copyright (C) 2019 Kouhei Sutou <kou@clear-code.com>
1
+ # Copyright (C) 2019-2024 Sutou Kouhei <kou@clear-code.com>
2
2
  #
3
3
  # This library is free software; you can redistribute it and/or
4
4
  # modify it under the terms of the GNU Lesser General Public
@@ -72,7 +72,7 @@ module ChupaText
72
72
  when DRAW_URI
73
73
  case local_name
74
74
  when "page"
75
- @slides << {text: ""}
75
+ @slides << {text: +""}
76
76
  end
77
77
  end
78
78
  end
@@ -1,4 +1,4 @@
1
- # Copyright (C) 2019 Kouhei Sutou <kou@clear-code.com>
1
+ # Copyright (C) 2019-2024 Sutou Kouhei <kou@clear-code.com>
2
2
  #
3
3
  # This library is free software; you can redistribute it and/or
4
4
  # modify it under the terms of the GNU Lesser General Public
@@ -96,9 +96,9 @@ module ChupaText
96
96
  when "table-row"
97
97
  @sheets.last[:rows] << []
98
98
  when "table-cell"
99
- @sheets.last[:rows].last << {text: ""}
99
+ @sheets.last[:rows].last << {text: +""}
100
100
  when "covered-table-cell"
101
- @sheets.last[:rows].last << {text: ""}
101
+ @sheets.last[:rows].last << {text: +""}
102
102
  when "shapes"
103
103
  @in_shapes = true
104
104
  end
@@ -116,7 +116,7 @@ module ChupaText
116
116
  case local_name
117
117
  when "table"
118
118
  sheet = @sheets.last
119
- text = ""
119
+ text = +""
120
120
  shape_texts = sheet[:shape_texts]
121
121
  unless shape_texts.empty?
122
122
  text << shape_texts.join("\n") << "\n"
@@ -1,4 +1,4 @@
1
- # Copyright (C) 2019 Kouhei Sutou <kou@clear-code.com>
1
+ # Copyright (C) 2019-2024 Sutou Kouhei <kou@clear-code.com>
2
2
  #
3
3
  # This library is free software; you can redistribute it and/or
4
4
  # modify it under the terms of the GNU Lesser General Public
@@ -29,7 +29,7 @@ module ChupaText
29
29
 
30
30
  private
31
31
  def process_content(entry, context, &block)
32
- context[:text] = ""
32
+ context[:text] = +""
33
33
  listener = TextListener.new(context[:text])
34
34
  parse(entry.file_data, listener)
35
35
  end
@@ -1,4 +1,4 @@
1
- # Copyright (C) 2013-2019 Kouhei Sutou <kou@clear-code.com>
1
+ # Copyright (C) 2013-2024 Sutou Kouhei <kou@clear-code.com>
2
2
  #
3
3
  # This library is free software; you can redistribute it and/or
4
4
  # modify it under the terms of the GNU Lesser General Public
@@ -29,7 +29,7 @@ module ChupaText
29
29
  end
30
30
 
31
31
  def decompose(data)
32
- text = ""
32
+ text = +""
33
33
  listener = Listener.new(text)
34
34
  data.open do |input|
35
35
  begin
@@ -1,4 +1,4 @@
1
- # Copyright (C) 2019 Kouhei Sutou <kou@clear-code.com>
1
+ # Copyright (C) 2019-2024 Sutou Kouhei <kou@clear-code.com>
2
2
  #
3
3
  # This library is free software; you can redistribute it and/or
4
4
  # modify it under the terms of the GNU Lesser General Public
@@ -16,6 +16,7 @@
16
16
 
17
17
  require "cgi/util"
18
18
  require "rexml/parsers/sax2parser"
19
+ require "rexml/rexml"
19
20
  require "rexml/sax2listener"
20
21
 
21
22
  begin
@@ -156,12 +157,22 @@ module ChupaText
156
157
  @listener.end_element(*args)
157
158
  end
158
159
 
159
- def characters(text)
160
- @listener.characters(CGI.unescapeHTML(text))
161
- end
160
+ if (REXML::VERSION <=> "3.3.2") >= 0
161
+ def characters(text)
162
+ @listener.characters(text)
163
+ end
164
+
165
+ def cdata(content)
166
+ @listener.cdata(content)
167
+ end
168
+ else
169
+ def characters(text)
170
+ @listener.characters(CGI.unescapeHTML(text))
171
+ end
162
172
 
163
- def cdata(content)
164
- @listener.cdata(CGI.unescapeHTML(content))
173
+ def cdata(content)
174
+ @listener.cdata(CGI.unescapeHTML(content))
175
+ end
165
176
  end
166
177
  end
167
178
  end
@@ -1,4 +1,4 @@
1
- # Copyright (C) 2013-2022 Sutou Kouhei <kou@clear-code.com>
1
+ # Copyright (C) 2013-2024 Sutou Kouhei <kou@clear-code.com>
2
2
  #
3
3
  # This library is free software; you can redistribute it and/or
4
4
  # modify it under the terms of the GNU Lesser General Public
@@ -15,5 +15,5 @@
15
15
  # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
16
 
17
17
  module ChupaText
18
- VERSION = "1.3.4"
18
+ VERSION = "1.3.6"
19
19
  end
@@ -1,4 +1,4 @@
1
- # Copyright (C) 2019 Kouhei Sutou <kou@clear-code.com>
1
+ # Copyright (C) 2019-2025 Sutou Kouhei <kou@clear-code.com>
2
2
  #
3
3
  # This library is free software; you can redistribute it and/or
4
4
  # modify it under the terms of the GNU Lesser General Public
@@ -153,6 +153,81 @@ class TestDecomposersOfficeOpenXMLWorkbook < Test::Unit::TestCase
153
153
  end
154
154
  end
155
155
 
156
+ def test_complex_shared_strings
157
+ path = fixture_path("xlsx", "complex-shared-strings.xlsx")
158
+ actual = decompose(path).collect do |data|
159
+ [
160
+ data["index"],
161
+ data["name"],
162
+ data.body,
163
+ ]
164
+ end
165
+ assert_equal([
166
+ [nil, nil, ""],
167
+ [
168
+ 0,
169
+ "新規",
170
+ "No\t案件番号\t開始日\t期日\tステータス\t備考\n" +
171
+ "1\t-\t45664\t45672\t対応中\n" +
172
+ "2\t-\t45664\t45672\t対応中\n" +
173
+ "3\t-\t45664\t45672\t対応中\n" +
174
+ "4\t-\t45664\t45666\t対応中\n" +
175
+ "5\t-\t45664\t45666\t対応中\n" +
176
+ "6\t-\t45663\t45665\t承認待ち\n" +
177
+ "7\t-\t45660\t45665\t承認待ち\n" +
178
+ "8\t-\t45653\t45663\t承認待ち\n" +
179
+ "9\t-\t45653\t45663\t承認待ち\n" +
180
+ "10\tPSR2401770\t45652\t45666\t対応中\n",
181
+ ],
182
+ [
183
+ 1,
184
+ "全体",
185
+ "No\t案件番号\t開始日\t期日\tステータス\n" +
186
+ "1\tPSR2401564\t45617\t45726\t対応中\n" +
187
+ "2\tPSR2401194\t45553\t45716\t対応中\n" +
188
+ "3\t-\t45664\t45672\t対応中\n" +
189
+ "4\t-\t45664\t45672\t対応中\n" +
190
+ "5\t-\t45664\t45672\t対応中\n" +
191
+ "6\t-\t45645\t45672\t対応中\n" +
192
+ "7\tPSR2401746\t45649\t45671\t対応中\n" +
193
+ "8\t-\t45640\t45667\t対応中\n" +
194
+ "9\t-\t45635\t45667\t対応中\n" +
195
+ "10\tPSR2401605\t45623\t45667\t対応中\n" +
196
+ "11\t-\t45664\t45666\t対応中\n" +
197
+ "12\t-\t45664\t45666\t対応中\n" +
198
+ "13\tPSR2401770\t45652\t45666\t対応中\n" +
199
+ "14\t-\t45645\t45665\t対応中\n" +
200
+ "15\tPSR2401609\t45624\t45666\t対応中\n",
201
+ ],
202
+ [
203
+ 2,
204
+ "案件",
205
+ "No\t案件番号\t開始日\t対応完了時期想定\n" +
206
+ "1\tPSR2401244\t45561.40347222222\t45744\n" +
207
+ "2\tPSR2401592\t45621.598611111112\t45698\n" +
208
+ "3\tPSR2401682\t45638.40902777778\t45688\n" +
209
+ "4\tPSR2401706\t45643.383333333331\t45671\n" +
210
+ "5\tPSR2401779\t45653.490277777775\t45671\n" +
211
+ "6\tPSR2401805\t45664.436805555553\t調整中\n" +
212
+ "7\tPSR2400677\t45455.588194444441\t45651\t完了\n" +
213
+ "8\tPSR2401666\t45636.405555555553\t45653\t完了\n" +
214
+ "9\tPSR2401714\t45644.630555555559\t45652\t完了\n",
215
+ ],
216
+ [
217
+ 3,
218
+ "障害恒久対応・改善対応",
219
+ "No\t案件番号\t分類\t開始日\t対応完了時期想定\n" +
220
+ "1\tPSR2401334\t改善対応\t45576.411805555559\t45688\n" +
221
+ "2\tPSR2401335\t改善対応\t45576.415277777778\t45688\n" +
222
+ "3\tPSR2401410\t改善対応\t45588.428472222222\t調整中\n" +
223
+ "4\tPSR2401411\t改善対応\t45588.432638888888\t調整中\n" +
224
+ "5\tPSR2401718\t障害恒久対応\t45645.386111111111\t45695\n" +
225
+ "6\tPSR2401807\t障害恒久対応\t45664.546527777777\t調整中\t1/16リリースで調整中\n",
226
+ ],
227
+ ],
228
+ actual)
229
+ end
230
+
156
231
  sub_test_case("invalid") do
157
232
  def test_empty
158
233
  messages = capture_log do
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: chupa-text
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.3.4
4
+ version: 1.3.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - Sutou Kouhei
8
8
  bindir: bin
9
9
  cert_chain: []
10
- date: 2024-09-22 00:00:00.000000000 Z
10
+ date: 2025-01-10 00:00:00.000000000 Z
11
11
  dependencies:
12
12
  - !ruby/object:Gem::Dependency
13
13
  name: archive-zip
@@ -189,6 +189,7 @@ files:
189
189
  - test/fixture/tar/top-level.tar
190
190
  - test/fixture/tar/utf-8.tar
191
191
  - test/fixture/xlsx/attributes.xlsx
192
+ - test/fixture/xlsx/complex-shared-strings.xlsx
192
193
  - test/fixture/xlsx/empty.xlsx
193
194
  - test/fixture/xlsx/multi-sheets.xlsx
194
195
  - test/fixture/xlsx/not-shared-cell.xlsx
@@ -232,7 +233,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
232
233
  - !ruby/object:Gem::Version
233
234
  version: '0'
234
235
  requirements: []
235
- rubygems_version: 3.6.0.dev
236
+ rubygems_version: 3.6.2
236
237
  specification_version: 4
237
238
  summary: ChupaText is an extensible text extractor. You can plug your custom text
238
239
  extractor in ChupaText. You can write your plugin by Ruby.