act_as_page_extractor 0.6.0 → 0.6.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 6fd076b19cd732c20e8aea0fd17f280523ff1b3013778d599757095c1f3e1e88
4
- data.tar.gz: 41832e5eb175e7417b2862a72fa72b063a60a95692f5116b4dab98f5401b37bb
3
+ metadata.gz: d0bd64f8e12d0c7bb3a75893738e30af616e4bcc5b958b18853b35363823b5ef
4
+ data.tar.gz: d87505b025bd924a545e2f6cbd9071958d65f993a234092f65b3cb7e108b16b1
5
5
  SHA512:
6
- metadata.gz: c2e3a77a42c332c7666b9e474b83dbb72527aa1a22833a3e59ecd97afa9145699479ade5dd2f8e6f1e434ebd8a11179179d96553e86297e984cf6808f086d78b
7
- data.tar.gz: 91b844ef793d0724454cb92c92e5600cde2924ca4925d80b2150fb25c7bfed6354b8bd817fde978da2d35928f47ce8f11c36934d739b181aa95dce802e4e0232
6
+ metadata.gz: af0708407f3b4546424e1666926c248cdb9fe0813ede2dd642d099836282d2f608d8d47edcd5cd513cef9b3ead231c192f6b815ec7721eb141b6820f561d0f30
7
+ data.tar.gz: 6a5969118ff6a6141aaaec8989e38670f75a817afa230636ed84f9f2a4e7c1f160ee569664f940e1355cfc74dff56e30370ca993fbb38d2a7139c17f56858acf
data/Gemfile CHANGED
@@ -3,7 +3,7 @@ source 'https://rubygems.org'
3
3
  # Specify your gem's dependencies in total_compressor.gemspec
4
4
  gemspec
5
5
 
6
- gem 'activerecord', '~> 6.0'
6
+ gem 'activerecord', '~> 6'
7
7
 
8
8
  gem 'awesome_print'
9
9
 
data/Gemfile.lock CHANGED
@@ -1,14 +1,14 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- act_as_page_extractor (0.2.3)
5
- activerecord (~> 6.0)
4
+ act_as_page_extractor (0.6.1)
5
+ activerecord (~> 6)
6
6
  awesome_print (~> 1)
7
7
  docsplit (~> 0)
8
8
  filesize (~> 0)
9
- pdf-reader (~> 1.4.0, >= 1.4.0)
9
+ pdf-reader (~> 1, >= 1.4)
10
10
  pdf_utils (~> 0)
11
- prawn (~> 0.7.1)
11
+ prawn (~> 1)
12
12
  total_compressor (~> 0)
13
13
 
14
14
  GEM
@@ -38,6 +38,7 @@ GEM
38
38
  i18n (1.14.5)
39
39
  concurrent-ruby (~> 1.0)
40
40
  minitest (5.25.1)
41
+ pdf-core (0.4.0)
41
42
  pdf-reader (1.4.1)
42
43
  Ascii85 (~> 1.0.0)
43
44
  afm (~> 0.2.1)
@@ -45,13 +46,9 @@ GEM
45
46
  ruby-rc4
46
47
  ttfunk
47
48
  pdf_utils (0.1.0)
48
- prawn (0.7.2)
49
- prawn-core (>= 0.7.2, < 0.8)
50
- prawn-layout (>= 0.7.2, < 0.8)
51
- prawn-security (>= 0.7.1, < 0.8)
52
- prawn-core (0.7.2)
53
- prawn-layout (0.7.2)
54
- prawn-security (0.7.1)
49
+ prawn (1.3.0)
50
+ pdf-core (~> 0.4.0)
51
+ ttfunk (~> 1.4.0)
55
52
  rake (12.3.3)
56
53
  rspec (3.13.0)
57
54
  rspec-core (~> 3.13.0)
@@ -77,7 +74,7 @@ GEM
77
74
  total_compressor (0.1.11)
78
75
  awesome_print (~> 1.1, >= 1.1.0)
79
76
  rubyzip (~> 1.2, >= 1.2.2)
80
- ttfunk (1.7.0)
77
+ ttfunk (1.4.0)
81
78
  tzinfo (2.0.6)
82
79
  concurrent-ruby (~> 1.0)
83
80
  zeitwerk (2.6.17)
@@ -87,16 +84,16 @@ PLATFORMS
87
84
 
88
85
  DEPENDENCIES
89
86
  act_as_page_extractor!
90
- activerecord (~> 6.0)
87
+ activerecord (~> 6)
91
88
  awesome_print
92
- bundler (~> 1.3)
89
+ bundler (~> 1)
93
90
  byebug
94
91
  docsplit
95
92
  filesize
96
93
  pdf-reader
97
94
  pdf_utils
98
95
  prawn
99
- rake (~> 12.3, >= 12.3.3)
96
+ rake (~> 12, >= 12.3.3)
100
97
  rspec
101
98
  simplecov
102
99
  total_compressor
data/README.md CHANGED
@@ -7,102 +7,111 @@ Library for extracting plain text from documents(files) for further processing (
7
7
 
8
8
  Install appropriate tools before using:
9
9
 
10
- sudo apt-get install zlib1g zlib1g-dev zip rar p7zip-full
11
-
10
+ ```sh
11
+ sudo apt-get install zlib1g zlib1g-dev zip rar p7zip-full
12
+ ```
12
13
  Add this line to your application's Gemfile:
13
14
 
14
- gem 'act_as_page_extractor'
15
-
15
+ ```rb
16
+ gem 'act_as_page_extractor'
17
+ bundle
18
+ ```
16
19
  ## Usage
17
20
 
18
- For example, for model Document we need execute:
21
+ For example, for model Document in the Rails framework we need to run:
19
22
 
20
- $ bundle
21
- $ rails g act_as_page_extractor:migration Document category_id user_id
23
+ ```sh
24
+ rails g act_as_page_extractor:migration Document category_id user_id
25
+ ```
22
26
 
23
27
  As a result we get two migration files:
24
-
25
- class AddPageExtractorFields < ActiveRecord::Migration
26
- def change
27
- add_column :documents, :page_extraction_state, :string, default: ''
28
- add_column :documents, :page_extraction_pages, :integer, default: 0
29
- add_column :documents, :page_extraction_doctype, :string, default: ''
30
- add_column :documents, :page_extraction_filesize, :string, default: ''
31
- end
32
- end
33
-
34
- class CreateExtractedPages < ActiveRecord::Migration
35
- def change
36
- create_table :extracted_pages do |t|
37
- t.text :page
38
- t.integer :document_id
39
- t.integer :category_id
40
- t.integer :user_id
41
- t.integer :page_number
42
-
43
- t.timestamps null: false
44
- end
45
-
46
- add_index :extracted_pages, :document_id
47
- add_index :extracted_pages, :category_id
48
- add_index :extracted_pages, [:document_id, :category_id]
49
- add_index :extracted_pages, [:document_id, :page_number]
50
- end
28
+ ```rb
29
+ class AddPageExtractorFields < ActiveRecord::Migration
30
+ def change
31
+ add_column :documents, :page_extraction_state, :string, default: ''
32
+ add_column :documents, :page_extraction_pages, :integer, default: 0
33
+ add_column :documents, :page_extraction_doctype, :string, default: ''
34
+ add_column :documents, :page_extraction_filesize, :string, default: ''
35
+ end
36
+ end
37
+
38
+ class CreateExtractedPages < ActiveRecord::Migration
39
+ def change
40
+ create_table :extracted_pages do |t|
41
+ t.text :page
42
+ t.integer :document_id
43
+ t.integer :category_id
44
+ t.integer :user_id
45
+ t.integer :page_number
46
+
47
+ t.timestamps null: false
51
48
  end
52
49
 
50
+ add_index :extracted_pages, :document_id
51
+ add_index :extracted_pages, :category_id
52
+ add_index :extracted_pages, [:document_id, :category_id]
53
+ add_index :extracted_pages, [:document_id, :page_number]
54
+ end
55
+ end
56
+ ```
53
57
 
54
- Model Document must have field which contains path to file(supports [different archive types](https://github.com/phlowerteam/total_compressor) that contains [txt, pdf, doc/x, txt, html, rtf, ...](https://www.exoplatform.com/docs/public/index.jsp?topic=%2FPLF43%2FPLFAdminGuide.Configuration.JODConverter.html))
58
+ The Document model must have a field that contains the path to the file (supports [different archive types](https://github.com/phlowerteam/total_compressor) that contain [txt, pdf, doc/x, txt, html, rtf, ...](https://docs-old.exoplatform.org/public/index.jsp?topic=%2FPLF41%2FPLFAdminGuide.Configuration.JODConverter.html))
55
59
 
56
60
  Add the following parameters to the model for initialization:
57
61
 
58
- class Document < ActiveRecord::Base
59
- include ActAsPageExtractor
62
+ ```rb
63
+ class Document < ActiveRecord::Base
64
+ include ActAsPageExtractor
60
65
 
61
- act_as_page_extractor options: {
62
- document_class: 'Document',
63
- save_as_pdf: true,
64
- filename: :filename,
65
- document_id: :document_id,
66
- additional_fields: [:category_id, :user_id],
67
- #file_storage: "/full/path/to/documents/storage",
68
- #pdf_storage: "/full/path/to/extracted/pdf/storage"
69
- }
66
+ act_as_page_extractor options: {
67
+ document_class: 'Document',
68
+ save_as_pdf: true,
69
+ filename: :filename,
70
+ document_id: :document_id,
71
+ additional_fields: [:category_id, :user_id],
72
+ #file_storage: "/full/path/to/documents/storage",
73
+ #pdf_storage: "/full/path/to/extracted/pdf/storage"
74
+ }
70
75
 
71
- has_many :extracted_pages, dependent: :destroy
72
- end
76
+ has_many :extracted_pages, dependent: :destroy
77
+ end
78
+ ```
73
79
 
74
80
  Now our instance has a few new methods:
75
81
 
76
- document = Document.first
77
- document.page_extract!
78
- document.extracted_pages
79
- document.pdf_path # if option 'save_as_pdf' is 'true'
82
+ ```rb
83
+ document = Document.first
84
+ document.page_extract!
85
+ document.extracted_pages
86
+ document.pdf_path # if option 'save_as_pdf' is 'true'
80
87
 
81
- # Access to pages
82
- ExtractedPage.count
88
+ # Access to pages
89
+ ExtractedPage.count
83
90
 
84
- # Importing whole directory of documents
85
- ActAsPageExtractor.import_files('/path/to/foler/with/documents')
91
+ # Importing whole directory of documents
92
+ ActAsPageExtractor.import_files('/path/to/foler/with/documents')
86
93
 
87
- # We can use cron for run the processing of all the new documents
88
- ActAsPageExtractor.start_extraction
94
+ # We can use cron for run the processing of all the new documents
95
+ ActAsPageExtractor.start_extraction
89
96
 
90
- # Getting statistics information of all documents
91
- ActAsPageExtractor.statistics
97
+ # Getting statistics information of all documents
98
+ ActAsPageExtractor.statistics
99
+ ```
92
100
 
93
- Parameters of initializing `act_as_page_extractor options: { ... }`:
101
+ Parameters of initializing **act_as_page_extractor**:
94
102
 
95
- `document_class` - name of model (e.g. 'Document)
96
- `save_as_pdf` - boolean [true, false] when we want save temporary pdf
97
- `filename` - name of field which contains access to the file and it should be an object with 'url' method that returns path to file (e.g. CarrierWave object with 'filename.url')
98
- `document_id` - name for saving id
99
- `additional_fields` - additional fields that added to extracted page (e.g. for indexing, etc.)
100
- `file_storage` - path for saving tmp files (by default it is "public")
101
- `pdf_storage` - path for saving pdf (by default it is "public/uploads/extracted/pdf")
103
+ * **document_class** - name of model (e.g. Document)
104
+ * **save_as_pdf** - boolean [true, false] when we want to save a temporary pdf
105
+ * **filename** - name of field which contains access to the file and it should be an object with 'url' method that returns path to file (e.g. CarrierWave object with 'filename.url')
106
+ * **document_id** - name for saving id
107
+ * **additional_fields** - additional fields that are added to the extracted page (e.g. for indexing, etc.)
108
+ * **file_storage** - path for saving tmp files (by default it is "public")
109
+ * **pdf_storage** - path for saving pdf (by default it is "public/uploads/extracted/pdf")
102
110
 
103
111
  ## Run tests
104
- $ COVERAGE=true rspec
105
-
112
+ ```sh
113
+ rspec
114
+ ```
106
115
  ## Contributing
107
116
  1. Fork it
108
117
  2. Create your feature branch (`git checkout -b my-new-feature`)
@@ -115,5 +124,5 @@ https://github.com/phlowerteam
115
124
  phlowerteam@gmail.com
116
125
 
117
126
  ## License
118
- Copyright (c) 2017 PhlowerTeam
127
+ Copyright (c) 2024 PhlowerTeam
119
128
  MIT License
@@ -10,25 +10,25 @@ Gem::Specification.new do |spec|
10
10
  spec.email = ['phlowerteam@gmail.com']
11
11
  spec.description = %q{Library (Docsplit wrapper) for text extraction from pdf, doc/x, txt files with OpenOffice}
12
12
  spec.summary = %q{Uses system calls}
13
- spec.homepage = 'https://github.com/phlowerteam'
13
+ spec.homepage = 'https://github.com/phlowerteam/act_as_page_extractor'
14
14
  spec.license = 'MIT'
15
15
 
16
16
  spec.files = `git ls-files`.split($/)
17
17
  spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
18
18
  spec.require_paths = ['lib']
19
19
 
20
- spec.add_development_dependency 'bundler', '~> 1.3'
21
- spec.add_development_dependency 'rake', '~> 12.3', '>= 12.3.3'
20
+ spec.add_development_dependency 'bundler', '~> 1'
21
+ spec.add_development_dependency 'rake', '~> 12', '>= 12.3.3'
22
22
  spec.add_development_dependency 'byebug', '~> 0'
23
23
  spec.add_development_dependency 'rspec', '~> 0'
24
24
  spec.add_development_dependency 'simplecov', '~> 0'
25
25
 
26
- spec.add_runtime_dependency 'activerecord', '~> 6.0'
26
+ spec.add_runtime_dependency 'activerecord', '~> 6'
27
27
  spec.add_runtime_dependency 'awesome_print', '~> 1'
28
28
  spec.add_runtime_dependency 'docsplit', '~> 0' # API for OpenOffice jodconverter (any to pdf)
29
29
  spec.add_runtime_dependency 'pdf_utils', '~> 0' # getting text from pdf
30
- spec.add_runtime_dependency 'prawn', '~> 0.7.1' # need for pdf_utils
31
- spec.add_runtime_dependency 'pdf-reader', '~> 1.4.0', '>= 1.4.0' # need for pdf_utils
30
+ spec.add_runtime_dependency 'prawn', '~> 1' # need for pdf_utils
31
+ spec.add_runtime_dependency 'pdf-reader', '~> 1', '>= 1.4' # need for pdf_utils
32
32
  spec.add_runtime_dependency 'total_compressor', '~> 0' # decompressing files
33
33
  spec.add_runtime_dependency 'filesize', '~> 0' # pretty size of file
34
34
  end
@@ -0,0 +1,14 @@
1
+ # GEM publishing
2
+
3
+ ```sh
4
+ # Add features or fix bugs
5
+ # Increase version number x.y.z
6
+ # lib/act_as_page_extractor/version.rb
7
+ bundle update
8
+ rspec
9
+ # git commit & git push
10
+
11
+ gem build act_as_page_extractor.gemspec
12
+ gem install ./act_as_page_extractor-x.y.z.gem
13
+ gem push act_as_page_extractor-x.y.z.gem
14
+ ```
@@ -1,17 +1,17 @@
1
1
  require 'timeout'
2
2
 
3
3
  module ActAsPageExtractor
4
+ # :nocov:
4
5
  def timeout_wrapper
5
6
  result = nil
6
7
  begin
7
8
  result = Timeout::timeout(60*5) { yield }
8
9
  rescue
9
- # :nocov:
10
10
  ensure
11
- # :nocov:
12
11
  result
13
12
  end
14
13
  end
14
+ # :nocov:
15
15
 
16
16
  def is_extracted
17
17
  @pdf_pages.to_i > 0 && self.extracted_pages.count == @pdf_pages
@@ -46,11 +46,11 @@ module ActAsPageExtractor
46
46
  # ap "@copy_document_path"
47
47
  # ap @copy_document_path
48
48
  # ap "@document_path"
49
- ap @document_path
49
+ # ap @document_path
50
50
  # ap "@pdf_path"
51
51
  # ap @pdf_path
52
52
  # ap "@pdf_pages"
53
- ap @pdf_pages
53
+ # ap @pdf_pages
54
54
  end
55
55
  # :nocov:
56
56
  end
@@ -1,5 +1,5 @@
1
1
  # :nocov:
2
2
  module ActAsPageExtractor
3
- VERSION = "0.6.0"
3
+ VERSION = "0.6.2"
4
4
  end
5
5
  # :nocov:
data/spec/spec_helper.rb CHANGED
@@ -1,7 +1,11 @@
1
- if ENV['COVERAGE']
1
+ unless ENV['SKIP_COVERAGE']
2
2
  require 'simplecov'
3
- SimpleCov.start 'rails'
3
+ SimpleCov.start 'rails' do
4
+ add_filter 'vendor'
5
+ end
6
+ SimpleCov.minimum_coverage 100
4
7
  end
8
+
5
9
  require 'rspec'
6
10
  require 'support/models'
7
11
  require 'act_as_page_extractor'
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: act_as_page_extractor
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.6.0
4
+ version: 0.6.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - PhlowerTeam
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2024-08-30 00:00:00.000000000 Z
11
+ date: 2024-08-31 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -16,21 +16,21 @@ dependencies:
16
16
  requirements:
17
17
  - - "~>"
18
18
  - !ruby/object:Gem::Version
19
- version: '1.3'
19
+ version: '1'
20
20
  type: :development
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
24
  - - "~>"
25
25
  - !ruby/object:Gem::Version
26
- version: '1.3'
26
+ version: '1'
27
27
  - !ruby/object:Gem::Dependency
28
28
  name: rake
29
29
  requirement: !ruby/object:Gem::Requirement
30
30
  requirements:
31
31
  - - "~>"
32
32
  - !ruby/object:Gem::Version
33
- version: '12.3'
33
+ version: '12'
34
34
  - - ">="
35
35
  - !ruby/object:Gem::Version
36
36
  version: 12.3.3
@@ -40,7 +40,7 @@ dependencies:
40
40
  requirements:
41
41
  - - "~>"
42
42
  - !ruby/object:Gem::Version
43
- version: '12.3'
43
+ version: '12'
44
44
  - - ">="
45
45
  - !ruby/object:Gem::Version
46
46
  version: 12.3.3
@@ -92,14 +92,14 @@ dependencies:
92
92
  requirements:
93
93
  - - "~>"
94
94
  - !ruby/object:Gem::Version
95
- version: '6.0'
95
+ version: '6'
96
96
  type: :runtime
97
97
  prerelease: false
98
98
  version_requirements: !ruby/object:Gem::Requirement
99
99
  requirements:
100
100
  - - "~>"
101
101
  - !ruby/object:Gem::Version
102
- version: '6.0'
102
+ version: '6'
103
103
  - !ruby/object:Gem::Dependency
104
104
  name: awesome_print
105
105
  requirement: !ruby/object:Gem::Requirement
@@ -148,34 +148,34 @@ dependencies:
148
148
  requirements:
149
149
  - - "~>"
150
150
  - !ruby/object:Gem::Version
151
- version: 0.7.1
151
+ version: '1'
152
152
  type: :runtime
153
153
  prerelease: false
154
154
  version_requirements: !ruby/object:Gem::Requirement
155
155
  requirements:
156
156
  - - "~>"
157
157
  - !ruby/object:Gem::Version
158
- version: 0.7.1
158
+ version: '1'
159
159
  - !ruby/object:Gem::Dependency
160
160
  name: pdf-reader
161
161
  requirement: !ruby/object:Gem::Requirement
162
162
  requirements:
163
- - - ">="
164
- - !ruby/object:Gem::Version
165
- version: 1.4.0
166
163
  - - "~>"
167
164
  - !ruby/object:Gem::Version
168
- version: 1.4.0
165
+ version: '1'
166
+ - - ">="
167
+ - !ruby/object:Gem::Version
168
+ version: '1.4'
169
169
  type: :runtime
170
170
  prerelease: false
171
171
  version_requirements: !ruby/object:Gem::Requirement
172
172
  requirements:
173
- - - ">="
174
- - !ruby/object:Gem::Version
175
- version: 1.4.0
176
173
  - - "~>"
177
174
  - !ruby/object:Gem::Version
178
- version: 1.4.0
175
+ version: '1'
176
+ - - ">="
177
+ - !ruby/object:Gem::Version
178
+ version: '1.4'
179
179
  - !ruby/object:Gem::Dependency
180
180
  name: total_compressor
181
181
  requirement: !ruby/object:Gem::Requirement
@@ -221,6 +221,7 @@ files:
221
221
  - README.md
222
222
  - Rakefile
223
223
  - act_as_page_extractor.gemspec
224
+ - docs/publishing.md
224
225
  - lib/act_as_page_extractor.rb
225
226
  - lib/act_as_page_extractor/modules/extracting.rb
226
227
  - lib/act_as_page_extractor/modules/interface.rb
@@ -248,7 +249,7 @@ files:
248
249
  - test/test-doc-3-pages.rtf
249
250
  - test/test-doc-3-pages.txt
250
251
  - test/test-doc-3-pages.wrong
251
- homepage: https://github.com/phlowerteam
252
+ homepage: https://github.com/phlowerteam/act_as_page_extractor
252
253
  licenses:
253
254
  - MIT
254
255
  metadata: {}
@@ -267,7 +268,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
267
268
  - !ruby/object:Gem::Version
268
269
  version: '0'
269
270
  requirements: []
270
- rubygems_version: 3.0.3
271
+ rubygems_version: 3.3.22
271
272
  signing_key:
272
273
  specification_version: 4
273
274
  summary: Uses system calls