textract 0.0.5 → 0.0.6

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 7dfaec842302697fff5fb526bc358045d63aeabb
4
- data.tar.gz: 6bfa752901cf6bf183ea50370100701224a49ba8
3
+ metadata.gz: 2b777157b81de9ecfc3d2aff6c9a88e8413c23fc
4
+ data.tar.gz: ed98b845d1819d2fbb0ab9ac8172ba95b308b227
5
5
  SHA512:
6
- metadata.gz: f667c4d08fbc1658d1ac98ccd5e79eae4871a51e18e425717573903d2c14f3fceb4f9d37068a410dc01c88e58cf4b6264f26ead1708d8ca867bf430daf439a5d
7
- data.tar.gz: cfef813ebb6fb91e37a71c952e06a56cee13520497a198f480d7f090eebc1b598f9f6dbf67bce8de1f0fa9fbee2d7595e8e5842077d14e6b06dca34b84e0c64d
6
+ metadata.gz: 1ba20d4cb668c8da2ee0a53a9585433b98b0c576e67771084345b57393b5a471cc5191db60654fcdfbf335ffc28478f46397460e0c788fb18a53a703d1fda5b9
7
+ data.tar.gz: 15de095a7a95ffba2a8263b2c67a062a42814c4e7da32045b7be53d1dcc421ebdf5141581607bfbdf87de80fa60b8e8130b73d43bdd1d1cfe2b36451553310ad
data/lib/textract/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Textract
2
- VERSION = "0.0.5"
2
+ VERSION = "0.0.6"
3
3
  end
data/lib/textract.rb CHANGED
@@ -16,7 +16,7 @@ module Textract
16
16
  OpenGraph.new(html)
17
17
  end
18
18
 
19
- def self.get_text_from_description(html, description, selectors)
19
+ def self.smart_extract(html, description, selectors)
20
20
  doc = Nokogiri::HTML html
21
21
  if selectors.nil?
22
22
  article = doc.search('article')
@@ -72,24 +72,28 @@ module Textract
72
72
  attr_reader :md5
73
73
  attr_reader :author
74
74
 
75
- def initialize(url, selectors)
75
+ def initialize(url, selectors, format="markdown")
76
76
  @url = url
77
77
  agent = Mechanize.new
78
78
  agent.user_agent_alias = 'Mac Safari'
79
79
  @html = agent.get(url).content
80
80
  @tags = Textract.get_og_tags(@html)
81
81
 
82
- @article = Textract.get_text_from_description(@html, @tags.description, selectors)
83
- @text = ReverseMarkdown.convert @article.content, unknown_tags: :bypass
82
+ @article = Textract.smart_extract(@html, @tags.description, selectors)
83
+ if @article.content.nil?
84
+ @text = ""
85
+ else
86
+ if format == 'markdown'
87
+ @text = ReverseMarkdown.convert @article.content, unknown_tags: :bypass
88
+ else
89
+ @text = @article.content
90
+ end
91
+ end
84
92
  @md5 = Textract.generate_hash @text
85
93
  @author = @article.author || Textract.get_author(@html)
86
94
  @title = @tags.title || Textract.get_page_title(@html)
87
95
  end
88
96
 
89
- def to_json
90
- to_h.to_json
91
- end
92
-
93
97
  def to_h
94
98
  {
95
99
  url: @url,
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: textract
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.5
4
+ version: 0.0.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - Adam Pash
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-03-30 00:00:00.000000000 Z
11
+ date: 2015-04-10 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: opengraph_parser