parsefile 0.0.7 → 0.0.8

Sign up to get free protection for your applications and to get access to all the features.
Files changed (3) hide show
  1. checksums.yaml +4 -4
  2. data/lib/ocrfile.rb +6 -2
  3. metadata +2 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 0b13ab5008debd780cd15627b816d35ffc40fae1
4
- data.tar.gz: 34b49fcc2c705fb4f00cd782133b68ed44d37886
3
+ metadata.gz: 2738907314aac77a653fd655f4a200e2a63b6ed6
4
+ data.tar.gz: 8c780c9b2304699e37f71cdb7f4842918bc61fc9
5
5
  SHA512:
6
- metadata.gz: d08d0c4314107dc2aa413af4cb4144181330fd7b4c545cd6320873ff833e1fa22a33f5e9d2001831dae23bcc17b95df48f5a063849a39d0fed82c1e92209cf61
7
- data.tar.gz: 450304b1c5d2ab8d3a7cc43f8fe71d9aed02c5dab8f3288ba213b1d599351751da552a26e3a04ba11f368bad4262a8703dc1b6c11285df2d891b2795082759e1
6
+ metadata.gz: e1385fdeb0923ebe72068c529341314aabebe1123f161e636762c19ad3325518716c759f9eae5fdd4bc6f12892709a45fe2dc5381e6151e6472677800fb31865
7
+ data.tar.gz: 919fbc77f37f6d40b15726ae9d5f36b622fdbe37d2789f730da61b5dea61d24ec12f359b607b096dbc5f0afd936e30fb68f23ee50efdd6276e3aa3c9273d6ade
data/lib/ocrfile.rb CHANGED
@@ -28,7 +28,7 @@ class OCRFile
28
28
  end
29
29
  end
30
30
  rescue # Detect errors
31
- binding.pry
31
+ #binding.pry
32
32
  end
33
33
 
34
34
  return @text
@@ -36,17 +36,20 @@ class OCRFile
36
36
 
37
37
  # Check if file is pdf
38
38
  def is_pdf?
39
+ puts "determined: is_pdf"
39
40
  file_start = File.open(@path, 'r') { |f| f.read(8)}
40
41
  file_start.match(/\%PDF-\d+\.?\d+/)
41
42
  end
42
43
 
43
44
  # Load text that is already extracted
44
45
  def load_extracted_text(file)
46
+ puts "file exists: load_extracted_text"
45
47
  @text = JSON.parse(File.read(file))["text"]
46
48
  end
47
49
 
48
50
  # Send file to give me text
49
51
  def give_me_text
52
+ puts "using: give_me_text"
50
53
  c = Curl::Easy.new("http://givemetext.okfnlabs.org/tika/tika/form")
51
54
  c.multipart_form_post = true
52
55
  c.http_post(Curl::PostField.file('file', @path))
@@ -56,6 +59,7 @@ class OCRFile
56
59
  end
57
60
 
58
61
  def give_me_text_local
62
+ puts "using: give_me_text_local"
59
63
  c = Curl::Easy.new(@tika + "/tika")
60
64
  # TODO: move this mime filtering to a higher global level
61
65
  mime_magic = MimeMagic.by_path(@path)
@@ -65,7 +69,6 @@ class OCRFile
65
69
  c.http_put(file_data)
66
70
 
67
71
  #binding.pry
68
-
69
72
  @text = c.body_str
70
73
  gotten_text_ok?(@text)
71
74
  end
@@ -77,6 +80,7 @@ class OCRFile
77
80
 
78
81
  # OCR with tesseract
79
82
  def ocr_pdf
83
+ puts "using: ocr_pdf"
80
84
  # Dir_paths
81
85
  base = Dir.pwd+"/"
82
86
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: parsefile
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.7
4
+ version: 0.0.8
5
5
  platform: ruby
6
6
  authors:
7
7
  - M. C. McGrath
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2016-10-10 00:00:00.000000000 Z
12
+ date: 2017-01-19 00:00:00.000000000 Z
13
13
  dependencies: []
14
14
  description: OCR file and extract metadata using Apache Tika and Tesseract
15
15
  email: shidash@shidash.com