simple_text_extract 3.0.8 → 3.0.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 44412a154f56b1100983582f15f3d56800edf040dfac98a5ae9fb9b3fa2d6401
4
- data.tar.gz: a6c5b2d94e13be12c71d3b4a66d0c2aef8fc6b7d7084a4c2b2d490e49954916d
3
+ metadata.gz: b19c5d7467780a0ccb69fb18e29eb81c9acb7e16caf39a93ddeedaf325d541bd
4
+ data.tar.gz: 942c5a13853805d7e66feacf997e5a9181e69ea309a880916ca08653c0e3e8bd
5
5
  SHA512:
6
- metadata.gz: 88e630c8aeee64240196e96c93675ba15a1a7c2f3c2c1a67ab51a05d35dc404a11574f61995628b2b399cd4d42570d40beddb55c0b6cda0bccece97415d995a7
7
- data.tar.gz: 42ff8b4a4702c52702ac95b2bab6b79495384cdec6d9f68011fb04bfca778cf91442c010ab67ac0a1ee59d27148b00be43221fbd20392d6e96f53d4ec5ae7788
6
+ metadata.gz: 42303612565dbfe9f23f699c3e554fb9509bbf7165e24d5b4b1fbea3f0ffe0bcd9f8723ac39861341f978b734ab671d17f6aa5a530e6fcf47dddfbe6e796acd1
7
+ data.tar.gz: c07deca30b24a70fef2baf7967bcfb411d4a60bdd24c624ce0a328e83be4ed558dca2628f468fa27b9d66e5e317413539d15d7b1bb551a8ec987d80eb69723a6
data/Gemfile.lock CHANGED
@@ -1,7 +1,9 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- simple_text_extract (3.0.8)
4
+ simple_text_extract (3.0.9)
5
+ base64
6
+ csv
5
7
  roo (~> 2.10.0)
6
8
  rubyzip (~> 2.3.2)
7
9
  spreadsheet (~> 1.3.0)
@@ -10,61 +12,62 @@ GEM
10
12
  remote: https://rubygems.org/
11
13
  specs:
12
14
  ast (2.4.2)
13
- bigdecimal (3.1.8)
15
+ base64 (0.2.0)
16
+ bigdecimal (3.1.9)
14
17
  coderay (1.1.3)
15
- json (2.7.2)
18
+ csv (3.3.2)
19
+ json (2.9.1)
16
20
  language_server-protocol (3.17.0.3)
17
- memory_profiler (1.0.2)
21
+ memory_profiler (1.1.0)
18
22
  method_source (1.1.0)
19
- minitest (5.24.1)
20
- mocha (2.4.0)
23
+ minitest (5.25.4)
24
+ mocha (2.7.1)
21
25
  ruby2_keywords (>= 0.0.5)
22
- nokogiri (1.16.6-arm64-darwin)
26
+ nokogiri (1.18.1-arm64-darwin)
23
27
  racc (~> 1.4)
24
- nokogiri (1.16.6-x86_64-linux)
28
+ nokogiri (1.18.1-x86_64-linux-gnu)
25
29
  racc (~> 1.4)
26
- parallel (1.25.1)
27
- parser (3.3.4.0)
30
+ parallel (1.26.3)
31
+ parser (3.3.6.0)
28
32
  ast (~> 2.4.1)
29
33
  racc
30
- pry (0.14.2)
34
+ pry (0.15.2)
31
35
  coderay (~> 1.1)
32
36
  method_source (~> 1.0)
33
- racc (1.8.0)
37
+ racc (1.8.1)
34
38
  rainbow (3.1.1)
35
39
  rake (13.2.1)
36
- regexp_parser (2.9.2)
37
- rexml (3.3.1)
38
- strscan
40
+ regexp_parser (2.10.0)
39
41
  roo (2.10.1)
40
42
  nokogiri (~> 1)
41
43
  rubyzip (>= 1.3.0, < 3.0.0)
42
- rubocop (1.65.0)
44
+ rubocop (1.69.2)
43
45
  json (~> 2.3)
44
46
  language_server-protocol (>= 3.17.0)
45
47
  parallel (~> 1.10)
46
48
  parser (>= 3.3.0.2)
47
49
  rainbow (>= 2.2.2, < 4.0)
48
- regexp_parser (>= 2.4, < 3.0)
49
- rexml (>= 3.2.5, < 4.0)
50
- rubocop-ast (>= 1.31.1, < 2.0)
50
+ regexp_parser (>= 2.9.3, < 3.0)
51
+ rubocop-ast (>= 1.36.2, < 2.0)
51
52
  ruby-progressbar (~> 1.7)
52
- unicode-display_width (>= 2.4.0, < 3.0)
53
- rubocop-ast (1.31.3)
53
+ unicode-display_width (>= 2.4.0, < 4.0)
54
+ rubocop-ast (1.37.0)
54
55
  parser (>= 3.3.1.0)
55
56
  ruby-ole (1.2.13.1)
56
57
  ruby-progressbar (1.13.0)
57
58
  ruby2_keywords (0.0.5)
58
59
  rubyzip (2.3.2)
59
- spreadsheet (1.3.1)
60
+ spreadsheet (1.3.3)
60
61
  bigdecimal
61
62
  ruby-ole
62
- strscan (3.1.0)
63
- unicode-display_width (2.5.0)
63
+ unicode-display_width (3.1.3)
64
+ unicode-emoji (~> 4.0, >= 4.0.4)
65
+ unicode-emoji (4.0.4)
64
66
 
65
67
  PLATFORMS
66
68
  arm64-darwin-21
67
69
  arm64-darwin-23
70
+ arm64-darwin-24
68
71
  x86_64-linux
69
72
 
70
73
  DEPENDENCIES
@@ -1,5 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ require "open3"
4
+
3
5
  class SimpleTextExtract::Extract # rubocop:disable Metrics/ClassLength
4
6
  def self.formatter(path)
5
7
  case path
@@ -78,7 +80,13 @@ class SimpleTextExtract::Extract # rubocop:disable Metrics/ClassLength
78
80
  def pdf_extract
79
81
  return nil if SimpleTextExtract.missing_dependency?("pdftotext")
80
82
 
81
- `pdftotext #{Shellwords.escape(file.path)} - 2>/dev/null`
83
+ stdout, stderr, status = Open3.capture3("pdftotext #{Shellwords.escape(file.path)} -")
84
+ if status.success?
85
+ stdout
86
+ else
87
+ warn "pdftotext failed: #{stderr}"
88
+ nil
89
+ end
82
90
  end
83
91
 
84
92
  def xlsx_extract
@@ -117,7 +125,13 @@ class SimpleTextExtract::Extract # rubocop:disable Metrics/ClassLength
117
125
  def doc_extract
118
126
  return nil if SimpleTextExtract.missing_dependency?("antiword")
119
127
 
120
- `antiword #{Shellwords.escape(file.path)}`
128
+ stdout, stderr, status = Open3.capture3("antiword #{Shellwords.escape(file.path)}")
129
+ if status.success?
130
+ stdout
131
+ else
132
+ warn "antiword failed: #{stderr}"
133
+ nil
134
+ end
121
135
  end
122
136
 
123
137
  def docx_extract
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module SimpleTextExtract
4
- VERSION = "3.0.8"
4
+ VERSION = "3.0.9"
5
5
  end
@@ -29,4 +29,6 @@ Gem::Specification.new do |spec|
29
29
  spec.add_dependency "roo", "~> 2.10.0"
30
30
  spec.add_dependency "rubyzip", "~> 2.3.2"
31
31
  spec.add_dependency "spreadsheet", "~> 1.3.0"
32
+ spec.add_dependency "base64"
33
+ spec.add_dependency "csv"
32
34
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: simple_text_extract
3
3
  version: !ruby/object:Gem::Version
4
- version: 3.0.8
4
+ version: 3.0.9
5
5
  platform: ruby
6
6
  authors:
7
7
  - Nick Weiland
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2024-07-12 00:00:00.000000000 Z
11
+ date: 2024-12-31 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: roo
@@ -52,6 +52,34 @@ dependencies:
52
52
  - - "~>"
53
53
  - !ruby/object:Gem::Version
54
54
  version: 1.3.0
55
+ - !ruby/object:Gem::Dependency
56
+ name: base64
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: csv
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
55
83
  description: Extract text from various file types before resorting to an OCR solution.
56
84
  email:
57
85
  - nickweiland@gmail.com
@@ -97,7 +125,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
97
125
  requirements:
98
126
  - antiword
99
127
  - pdftotext/poppler
100
- rubygems_version: 3.5.15
128
+ rubygems_version: 3.5.21
101
129
  signing_key:
102
130
  specification_version: 4
103
131
  summary: Extract text from various file types before resorting to an OCR solution.