simple_text_extract 3.0.2 → 3.0.4

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 388ec404e856a47441f3bb3e23873d70eff4fabf86d96afcfeb6f7ad324e262d
4
- data.tar.gz: 054f2cdb252b91ae14bdc678af1b75d433bfc6266d68ccace27ce4ec829c6484
3
+ metadata.gz: 1bcf42e8ea86d7b9bb1f14731cbf2219e5d2a3c9bca5288e0441776b9822a835
4
+ data.tar.gz: 3d17a5ec43537ea50c64b8b07237cd21125794084fa964a121267895b5ba5023
5
5
  SHA512:
6
- metadata.gz: 5c5a9d4f02bd00f32495803f60b2552d084f204a39df5c239d29e9be2c04013ae44b865c805c750c3a52a51a5e9b93a358e97d386a351aa16460664e048bac61
7
- data.tar.gz: f7bc4192659abaf6653f58c01b446e11e483dddad2efb91ebf077a88767346f50997d87ba8af5d4ab26e7ac2ed7bf6a4393fecf75b78b1968662a266e251a84e
6
+ metadata.gz: 7f452ddf56c6d464f4d16de10e4c934bd97b78e6adbedb43b163a5511881b81d466f3064e963410957b194420f74c0c6b554356b411f03845b37c417c845244f
7
+ data.tar.gz: be9d14077b31539fe14aec768cea5669958c922f0360eb8125e9289a8df61a36a6a2574b7ea1458eed010cf474ef80d70e7a601083016a0d968e37417c314c30
data/- ADDED
@@ -0,0 +1 @@
1
+ Test
data/CHANGELOG.md CHANGED
@@ -1,4 +1,8 @@
1
- ## 3.0.1 (2023-04-17)
1
+ ## 3.0.3 (2023-04-27)
2
+
3
+ - Add support for extracting text from tables in DOCX files.
4
+
5
+ ## 3.0.2 (2023-04-17)
2
6
 
3
7
  - Coerces filename in `SimpleTextExtract.supports?(filename:)` to string.
4
8
 
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- simple_text_extract (3.0.2)
4
+ simple_text_extract (3.0.4)
5
5
  roo (~> 2.10.0)
6
6
  rubyzip (~> 2.3.2)
7
7
  spreadsheet (~> 1.3.0)
@@ -1,6 +1,6 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- class SimpleTextExtract::Extract
3
+ class SimpleTextExtract::Extract # rubocop:disable Metrics/ClassLength
4
4
  def self.formatter(path)
5
5
  case path
6
6
  when /.zip$/i
@@ -28,7 +28,7 @@ class SimpleTextExtract::Extract
28
28
  end
29
29
 
30
30
  def to_s
31
- @to_s ||= extract.to_s.gsub(/[^\S\n]+/, " ").gsub(/\s?\n\s+/, "\n").strip
31
+ @to_s ||= extract.to_s.scrub.gsub(/[^\S\n]+/, " ").gsub(/\s?\n\s+/, "\n").strip
32
32
  end
33
33
 
34
34
  private
@@ -133,6 +133,13 @@ class SimpleTextExtract::Extract
133
133
  doc.xpath("//w:document//w:body/w:p").each do |node|
134
134
  result << node.text
135
135
  end
136
+
137
+ doc.xpath("//w:document//w:body//w:tbl").each do |node|
138
+ node.xpath(".//w:tr").each do |row|
139
+ text = row.xpath("w:tc").map(&:text)
140
+ result << text.join(", ")
141
+ end
142
+ end
136
143
  end
137
144
 
138
145
  result.join("\n")
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module SimpleTextExtract
4
- VERSION = "3.0.2"
4
+ VERSION = "3.0.4"
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: simple_text_extract
3
3
  version: !ruby/object:Gem::Version
4
- version: 3.0.2
4
+ version: 3.0.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Nick Weiland
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2023-04-18 00:00:00.000000000 Z
11
+ date: 2023-06-03 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: roo
@@ -59,6 +59,7 @@ executables: []
59
59
  extensions: []
60
60
  extra_rdoc_files: []
61
61
  files:
62
+ - "-"
62
63
  - ".github/workflows/build.yml"
63
64
  - ".gitignore"
64
65
  - ".rubocop.yml"