textract 0.0.5 → 0.0.6

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 7dfaec842302697fff5fb526bc358045d63aeabb
4
- data.tar.gz: 6bfa752901cf6bf183ea50370100701224a49ba8
3
+ metadata.gz: 2b777157b81de9ecfc3d2aff6c9a88e8413c23fc
4
+ data.tar.gz: ed98b845d1819d2fbb0ab9ac8172ba95b308b227
5
5
  SHA512:
6
- metadata.gz: f667c4d08fbc1658d1ac98ccd5e79eae4871a51e18e425717573903d2c14f3fceb4f9d37068a410dc01c88e58cf4b6264f26ead1708d8ca867bf430daf439a5d
7
- data.tar.gz: cfef813ebb6fb91e37a71c952e06a56cee13520497a198f480d7f090eebc1b598f9f6dbf67bce8de1f0fa9fbee2d7595e8e5842077d14e6b06dca34b84e0c64d
6
+ metadata.gz: 1ba20d4cb668c8da2ee0a53a9585433b98b0c576e67771084345b57393b5a471cc5191db60654fcdfbf335ffc28478f46397460e0c788fb18a53a703d1fda5b9
7
+ data.tar.gz: 15de095a7a95ffba2a8263b2c67a062a42814c4e7da32045b7be53d1dcc421ebdf5141581607bfbdf87de80fa60b8e8130b73d43bdd1d1cfe2b36451553310ad
data/lib/textract/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Textract
2
- VERSION = "0.0.5"
2
+ VERSION = "0.0.6"
3
3
  end
data/lib/textract.rb CHANGED
@@ -16,7 +16,7 @@ module Textract
16
16
  OpenGraph.new(html)
17
17
  end
18
18
 
19
- def self.get_text_from_description(html, description, selectors)
19
+ def self.smart_extract(html, description, selectors)
20
20
  doc = Nokogiri::HTML html
21
21
  if selectors.nil?
22
22
  article = doc.search('article')
@@ -72,24 +72,28 @@ module Textract
72
72
  attr_reader :md5
73
73
  attr_reader :author
74
74
 
75
- def initialize(url, selectors)
75
+ def initialize(url, selectors, format="markdown")
76
76
  @url = url
77
77
  agent = Mechanize.new
78
78
  agent.user_agent_alias = 'Mac Safari'
79
79
  @html = agent.get(url).content
80
80
  @tags = Textract.get_og_tags(@html)
81
81
 
82
- @article = Textract.get_text_from_description(@html, @tags.description, selectors)
83
- @text = ReverseMarkdown.convert @article.content, unknown_tags: :bypass
82
+ @article = Textract.smart_extract(@html, @tags.description, selectors)
83
+ if @article.content.nil?
84
+ @text = ""
85
+ else
86
+ if format == 'markdown'
87
+ @text = ReverseMarkdown.convert @article.content, unknown_tags: :bypass
88
+ else
89
+ @text = @article.content
90
+ end
91
+ end
84
92
  @md5 = Textract.generate_hash @text
85
93
  @author = @article.author || Textract.get_author(@html)
86
94
  @title = @tags.title || Textract.get_page_title(@html)
87
95
  end
88
96
 
89
- def to_json
90
- to_h.to_json
91
- end
92
-
93
97
  def to_h
94
98
  {
95
99
  url: @url,
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: textract
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.5
4
+ version: 0.0.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - Adam Pash
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-03-30 00:00:00.000000000 Z
11
+ date: 2015-04-10 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: opengraph_parser