textutils 0.7.0 → 0.7.1
Sign up to get free protection for your applications and to get access to all the features.
- data/Manifest.txt +2 -0
- data/README.markdown +1 -1
- data/Rakefile +1 -1
- data/lib/textutils/helper/hypertext_helper.rb +19 -6
- data/lib/textutils/patterns.rb +66 -0
- data/lib/textutils/sanitizier.rb +72 -0
- data/lib/textutils/utils.rb +7 -1
- data/lib/textutils/version.rb +1 -1
- data/lib/textutils.rb +3 -0
- data/test/test_hypertext_helper.rb +38 -0
- metadata +15 -13
data/Manifest.txt
CHANGED
@@ -16,11 +16,13 @@ lib/textutils/helper/title_helper.rb
|
|
16
16
|
lib/textutils/helper/unicode_helper.rb
|
17
17
|
lib/textutils/helper/value_helper.rb
|
18
18
|
lib/textutils/helper/xml_helper.rb
|
19
|
+
lib/textutils/patterns.rb
|
19
20
|
lib/textutils/reader/code_reader.rb
|
20
21
|
lib/textutils/reader/fixture_reader.rb
|
21
22
|
lib/textutils/reader/hash_reader.rb
|
22
23
|
lib/textutils/reader/line_reader.rb
|
23
24
|
lib/textutils/reader/values_reader.rb
|
25
|
+
lib/textutils/sanitizier.rb
|
24
26
|
lib/textutils/utils.rb
|
25
27
|
lib/textutils/version.rb
|
26
28
|
test/helper.rb
|
data/README.markdown
CHANGED
@@ -4,7 +4,7 @@
|
|
4
4
|
* bugs :: [github.com/rubylibs/textutils/issues](https://github.com/rubylibs/textutils/issues)
|
5
5
|
* gem :: [rubygems.org/gems/textutils](https://rubygems.org/gems/textutils)
|
6
6
|
* rdoc :: [rubydoc.info/gems/textutils](http://rubydoc.info/gems/textutils)
|
7
|
-
* forum :: [
|
7
|
+
* forum :: [ruby-talk@ruby-lang.org](https://www.ruby-lang.org/en/community/mailing-lists/)
|
8
8
|
|
9
9
|
|
10
10
|
## Filters
|
data/Rakefile
CHANGED
@@ -11,7 +11,7 @@ Hoe.spec 'textutils' do
|
|
11
11
|
self.urls = ['https://github.com/rubylibs/textutils']
|
12
12
|
|
13
13
|
self.author = 'Gerald Bauer'
|
14
|
-
self.email = '
|
14
|
+
self.email = 'ruby-talk@ruby-lang.org'
|
15
15
|
|
16
16
|
# switch extension to .markdown for github formatting
|
17
17
|
self.readme_file = 'README.markdown'
|
@@ -8,15 +8,28 @@ def strip_tags( ht )
|
|
8
8
|
### to be done
|
9
9
|
## strip markup tags; return plain text; use brute force for now
|
10
10
|
# check at least for presence of required a-z+ tag names
|
11
|
-
|
11
|
+
#
|
12
|
+
# note: make sure we cover h1/h2/h3/h4/h5/h6 tag w/ number!!
|
13
|
+
|
12
14
|
### ht.gsub( /<[^>]+>/, '' ) - old simple
|
13
15
|
|
14
16
|
## todo: add strip comments e.g. <!-- xxxx --> ???
|
15
17
|
## or use new strip_comments( ht )
|
16
18
|
|
17
|
-
|
18
|
-
|
19
|
-
|
19
|
+
|
20
|
+
## note: follow official xml spec
|
21
|
+
## - allows for first char: (Letter | '_' | ':')
|
22
|
+
## - allows for followup chars: (Letter | Digit | '_' | ':' | '.' | '-')
|
23
|
+
|
24
|
+
tag_name_pattern = "[a-z_:][a-z0-9_:.\\-]*"
|
25
|
+
|
26
|
+
empty_tag_pattern = "<#{tag_name_pattern}\\s*/>"
|
27
|
+
opening_tag_pattern = "<#{tag_name_pattern}(\\s+[^>]*)?>"
|
28
|
+
closing_tag_pattern = "</#{tag_name_pattern}\\s*>"
|
29
|
+
|
30
|
+
ht = ht.gsub( /#{empty_tag_pattern}/i, '' ) # remove xml-style empty tags eg. <br /> or <br/>
|
31
|
+
ht = ht.gsub( /#{opening_tag_pattern}/i, '' ) # opening tag <p>
|
32
|
+
ht = ht.gsub( /#{closing_tag_pattern}/i, '' ) # closing tag e.g. </p>
|
20
33
|
ht
|
21
34
|
end
|
22
35
|
|
@@ -52,8 +65,8 @@ def whitelist( ht, tags, opts={} )
|
|
52
65
|
# -- note: will NOT strip comments for now e.g. <!-- -->
|
53
66
|
ht = strip_tags( ht )
|
54
67
|
|
55
|
-
pp ht # fix: debugging indo - remove
|
56
|
-
|
68
|
+
## pp ht # fix: debugging info - remove
|
69
|
+
|
57
70
|
############################################
|
58
71
|
# step three - restore whitelisted tags
|
59
72
|
|
@@ -0,0 +1,66 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
module TextUtils

  # collection of regex patterns (as strings) for reuse
  #
  # every pattern is a plain string so that it can be interpolated into a
  # larger regex, e.g.
  #
  #   /#{YYYY_STRICT_20_PATTERN}-#{MM_STRICT_PATTERN}-#{DD_STRICT_PATTERN}/
  #
  # patterns that contain a top-level alternation (a|b) are wrapped in a
  # non-capturing group (?:...) - otherwise the alternation would leak into
  # the surrounding regex once the pattern gets interpolated.

  ### todo: add a patterns.md page to github ??
  ##   - add regexper pics??

  ############
  # about ruby regexps
  #
  # try the rubular - Ruby regular expression editor and tester
  #  -> http://rubular.com
  #   code -> ?? by ??
  #
  #
  # Jeff Avallone's Regexper - Shows State-Automata Diagrams
  #  try -> http://regexper.com
  #   code -> https://github.com/javallone/regexper
  #
  #
  #  Regular Expressions | The Bastards Book of Ruby by Dan Nguyen
  # http://ruby.bastardsbook.com/chapters/regexes/
  #
  # move to notes  regex|patterns on geraldb.github.io ??
  #

  EMPTY_LINE_PATTERN = '^\s*$'

  #################################
  ### Start of Line Comment Patterns

  COMMENT_LINE_PATTERN = '^\s*#'   # e.g. Ruby/Shell style starting w/ # this is a comment

  COMMENT_LINE_HASKELL_PATTERN = '^\s*--'   # e.g. Haskell/Ada? style starting w/ --
  COMMENT_LINE_ALT_PATTERN     = COMMENT_LINE_HASKELL_PATTERN

  COMMENT_LINE_TEX_PATTERN     = '^\s*%'    # e.g. TeX/LaTeX style starting w/ %
  COMMENT_LINE_ALT_II_PATTERN  = COMMENT_LINE_TEX_PATTERN

  #############################
  ### End of Line (EOL) Comment Patterns

  EOL_COMMENT_PATTERN = '\s+#.+$'   # fix: use \b word boundary instead of \s - why? why not?
  #  why \b - everything but a-z0-9, that is, spaces but also includes umlauts, special chars etc.

  ##############
  ## Dates
  #
  # some info at www.regular-expressions.info/dates.html

  YYYY_STRICT_19_20_PATTERN = '(?:19|20)\d\d'
  YYYY_STRICT_20_PATTERN    = '20\d\d'

  # note: grouped with (?:...) so the alternatives stay together when the
  #  pattern is embedded in a bigger regex
  MM_STRICT_PATTERN = '(?:0[1-9]|1[012])'      # 01..12 (two digits required)
  M_STRICT_PATTERN  = '(?:0?[1-9]|1[012])'     # 1..12  (leading zero optional)

  DD_STRICT_PATTERN = '(?:0[1-9]|[12][0-9]|3[01])'    # 01..31 (two digits required)
  D_STRICT_PATTERN  = '(?:0?[1-9]|[12][0-9]|3[01])'   # 1..31  (leading zero optional)

  ######
  ## Time


end # TextUtils
|
@@ -0,0 +1,72 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
module TextUtils
|
4
|
+
|
5
|
+
# Converts hypertext (html source) into plain text.
#
# Usage:
#   TextUtils::Sanitizier.new( '<p>hello</p>' ).to_plain_text
#
# The conversion runs as a fixed pipeline (see to_plain_text). None of the
# steps modifies the string that was passed in; every step returns a new string.
class Sanitizier

  include LogUtils::Logging

  @@ignore_tags = %w{ head script style }   # dropped together with their content
  @@inline_tags = %w{ span b i u }          # unwrapped; content kept, followed by a space
  @@block_tags  = %w{ p div ul ol }         # unwrapped; content kept, surrounded by newlines

  # ht - hypertext (html source) as a String
  def initialize( ht )
    @ht = ht    # hypertext (html source)
  end

  # Returns the plain text version of the hypertext passed to the constructor.
  def to_plain_text
    ht = handle_ignore_tags( @ht )

    ## handle_pre_tags ?? - special rule for preformatted (keep whitespace)

    ht = handle_inline_tags( ht )
    ht = handle_block_tags( ht )
    ht = handle_other_tags( ht )  # rules for remain/left over tags

    # note: entities get unescaped last - otherwise an escaped tag
    #  (e.g. &lt;b&gt;) would be turned into markup and stripped
    handle_entities( ht )
  end

  # Returns ht with html entities unescaped.
  def handle_entities( ht )
    ## unescape entities
    #  - check if it also works for generic entities like &nbsp; etc.
    #     or only for &gt; &lt; etc.
    #  NOTE(review): the CGI docs list only &amp; &quot; &lt; &gt; (plus numeric
    #   character references); named entities like &nbsp; look like
    #   they are passed through unchanged - confirm
    CGI.unescapeHTML( ht )
  end

  # Returns a regex matching <tag ...>content</tag>; the content is capture group 1.
  def tag_regex( tag )
    # note use non-greedy .*? for content
    #
    # note: the tag name must be followed by whitespace or > - otherwise
    #  b would also match <br> or <body>, u would match <ul>, p would match <pre> etc.
    name = Regexp.escape( tag )
    /<#{name}(?:\s[^>]*)?>(.*?)<\/#{name}\s*>/mi
  end

  # Returns ht with all ignored elements (tags plus content) removed.
  def handle_ignore_tags( ht )
    @@ignore_tags.inject( ht ) do |text, tag|
      text.gsub( tag_regex(tag), '' )
    end
  end

  # Returns ht with inline elements replaced by their content.
  def handle_inline_tags( ht )
    @@inline_tags.inject( ht ) do |text, tag|
      # add a space after
      text.gsub( tag_regex(tag), '\1 ' )
    end
  end

  # Returns ht with block elements replaced by their content on a line of its own.
  def handle_block_tags( ht )
    @@block_tags.inject( ht ) do |text, tag|
      # note: the backslash must be doubled inside double quotes;
      #  a plain "\1" is the control character U+0001 (octal escape)
      #  and not a reference to capture group 1
      text.gsub( tag_regex(tag), "\n\\1\n" )
    end
  end

  # Returns ht with all remaining (left over) tags removed; content is kept.
  # Covers opening tags e.g. <a href="x">, closing tags e.g. </a> and
  # xml-style empty tags e.g. <br /> or <br/>.
  # Will NOT strip comments e.g. <!-- -->
  def handle_other_tags( ht )
    ## tag names follow the xml spec
    ##  - first char:     letter | '_' | ':'
    ##  - followup chars: letter | digit | '_' | ':' | '.' | '-'
    ht.gsub( %r{</?[a-z_:][a-z0-9_:.\-]*(?:\s[^>]*)?/?>}i, '' )
  end

end # class Sanitizier
|
71
|
+
|
72
|
+
end # module TextUtils
|
data/lib/textutils/utils.rb
CHANGED
@@ -39,8 +39,14 @@ def find_data_path_from_gemfile_gitref( name )
|
|
39
39
|
|
40
40
|
# escape chars for regex e.g. . becomes \.
|
41
41
|
name_esc = name.gsub( '.', '\.' )
|
42
|
-
name_regex = /\/(#{name_esc}-[a-z0-9]+)|(#{name_esc})\/lib$/ # e.g. /\/(beer\.db-[a-z0-9]+)|(beer\.db)\//
|
43
42
|
|
43
|
+
|
44
|
+
# note:
|
45
|
+
# - hexdigest must be 12 chars e.g. b7d1c9619a54 or similar
|
46
|
+
|
47
|
+
# e.g. match /\/((beer\.db-[a-z0-9]{12})|(beer\.db))\/lib$/
|
48
|
+
|
49
|
+
name_regex = /\/((#{name_esc}-[a-z0-9]{12})|(#{name_esc}))\/lib$/
|
44
50
|
candidates = []
|
45
51
|
$LOAD_PATH.each do |path|
|
46
52
|
if path =~ name_regex
|
data/lib/textutils/version.rb
CHANGED
data/lib/textutils.rb
CHANGED
@@ -16,6 +16,9 @@ require 'logutils'
|
|
16
16
|
|
17
17
|
require 'textutils/version'
|
18
18
|
|
19
|
+
require 'textutils/patterns' # regex patterns for reuse
|
20
|
+
require 'textutils/sanitizier'
|
21
|
+
|
19
22
|
require 'textutils/filter/code_filter'
|
20
23
|
require 'textutils/filter/comment_filter'
|
21
24
|
require 'textutils/filter/erb_django_filter'
|
@@ -13,6 +13,44 @@ class TestHypertextHelper < MiniTest::Unit::TestCase
|
|
13
13
|
|
14
14
|
include TextUtils::HypertextHelper # lets us use textify, etc.
|
15
15
|
|
16
|
+
def test_strip_tags
  # [expected plain text, markup] pairs - checked one by one, top to bottom
  expectations = [
    ## empty tags
    ['', '<hr />'],
    ['', '<hr/>'],
    ['', '<my-emtpy/>'],
    ['', '<my-emtpy />'],

    ## opening + closing tags w/o attributes
    ['hello', '<h1>hello</h1>'],
    ['hello', '<h2>hello</h2>'],
    ['hello', '<p>hello</p>'],
    ['hello', '<div>hello</div>'],
    ['hello', '<my-header>hello</my-header>'],

    ## opening + closing tags w/ attributes
    ['hello', '<h1 id="test">hello</h1>'],
    ['hello', '<p id="test">hello</p>'],
    ['hello', '<div id="test">hello</div>'],
    ['hello', '<my-header id="test">hello</my-header>'],

    ## check case in-sensitive
    ['', '<HR />'],
    ['', '<hR />'],
    ['', '<Hr />'],
    ['', '<HR/>'],
    ['', '<My-EmTpY/>'],
    ['', '<My-EmTpY />'],

    ['hello', '<H1>hello</H1>'],
    ['hello', '<H2>hello</h2>'],
    ['hello', '<P>hello</P>'],
    ['hello', '<DiV>hello</dIv>'],
    ['hello', '<mY-hEaDer>hello</MY-HEADER>'],

    ['hello', '<H1 ID="test">hello</h1>'],
    ['hello', '<P id="test">hello</p>'],
    ['hello', '<DIV Id="test">hello</dIV>'],
    ['hello', '<MY-HEADER iD="test">hello</mY-hEaDeR>']
  ]

  expectations.each do |plain, markup|
    assert_equal plain, strip_tags( markup )
  end
end
|
53
|
+
|
16
54
|
|
17
55
|
def test_stylesheet_link_tag
|
18
56
|
hyout = "<link rel='stylesheet' type='text/css' href='hello.css'>"
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: textutils
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.7.
|
4
|
+
version: 0.7.1
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-
|
12
|
+
date: 2013-12-25 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: logutils
|
16
|
-
requirement: &
|
16
|
+
requirement: &21849324 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ~>
|
@@ -21,31 +21,31 @@ dependencies:
|
|
21
21
|
version: '0.5'
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *21849324
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: rdoc
|
27
|
-
requirement: &
|
27
|
+
requirement: &21848640 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ~>
|
31
31
|
- !ruby/object:Gem::Version
|
32
|
-
version: '
|
32
|
+
version: '4.0'
|
33
33
|
type: :development
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *21848640
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: hoe
|
38
|
-
requirement: &
|
38
|
+
requirement: &21848040 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ~>
|
42
42
|
- !ruby/object:Gem::Version
|
43
|
-
version: '3.
|
43
|
+
version: '3.7'
|
44
44
|
type: :development
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *21848040
|
47
47
|
description: textutils - Text Filters, Helpers, Readers and More
|
48
|
-
email:
|
48
|
+
email: ruby-talk@ruby-lang.org
|
49
49
|
executables: []
|
50
50
|
extensions: []
|
51
51
|
extra_rdoc_files:
|
@@ -69,11 +69,13 @@ files:
|
|
69
69
|
- lib/textutils/helper/unicode_helper.rb
|
70
70
|
- lib/textutils/helper/value_helper.rb
|
71
71
|
- lib/textutils/helper/xml_helper.rb
|
72
|
+
- lib/textutils/patterns.rb
|
72
73
|
- lib/textutils/reader/code_reader.rb
|
73
74
|
- lib/textutils/reader/fixture_reader.rb
|
74
75
|
- lib/textutils/reader/hash_reader.rb
|
75
76
|
- lib/textutils/reader/line_reader.rb
|
76
77
|
- lib/textutils/reader/values_reader.rb
|
78
|
+
- lib/textutils/sanitizier.rb
|
77
79
|
- lib/textutils/utils.rb
|
78
80
|
- lib/textutils/version.rb
|
79
81
|
- test/helper.rb
|
@@ -104,11 +106,11 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
104
106
|
version: '0'
|
105
107
|
requirements: []
|
106
108
|
rubyforge_project: textutils
|
107
|
-
rubygems_version: 1.8.
|
109
|
+
rubygems_version: 1.8.16
|
108
110
|
signing_key:
|
109
111
|
specification_version: 3
|
110
112
|
summary: textutils - Text Filters, Helpers, Readers and More
|
111
113
|
test_files:
|
112
|
-
- test/test_unicode_helper.rb
|
113
114
|
- test/test_hypertext_helper.rb
|
114
115
|
- test/test_title_helper.rb
|
116
|
+
- test/test_unicode_helper.rb
|