textutils 0.7.0 → 0.7.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Manifest.txt +2 -0
- data/README.markdown +1 -1
- data/Rakefile +1 -1
- data/lib/textutils/helper/hypertext_helper.rb +19 -6
- data/lib/textutils/patterns.rb +66 -0
- data/lib/textutils/sanitizier.rb +72 -0
- data/lib/textutils/utils.rb +7 -1
- data/lib/textutils/version.rb +1 -1
- data/lib/textutils.rb +3 -0
- data/test/test_hypertext_helper.rb +38 -0
- metadata +15 -13
data/Manifest.txt
CHANGED
@@ -16,11 +16,13 @@ lib/textutils/helper/title_helper.rb
|
|
16
16
|
lib/textutils/helper/unicode_helper.rb
|
17
17
|
lib/textutils/helper/value_helper.rb
|
18
18
|
lib/textutils/helper/xml_helper.rb
|
19
|
+
lib/textutils/patterns.rb
|
19
20
|
lib/textutils/reader/code_reader.rb
|
20
21
|
lib/textutils/reader/fixture_reader.rb
|
21
22
|
lib/textutils/reader/hash_reader.rb
|
22
23
|
lib/textutils/reader/line_reader.rb
|
23
24
|
lib/textutils/reader/values_reader.rb
|
25
|
+
lib/textutils/sanitizier.rb
|
24
26
|
lib/textutils/utils.rb
|
25
27
|
lib/textutils/version.rb
|
26
28
|
test/helper.rb
|
data/README.markdown
CHANGED
@@ -4,7 +4,7 @@
|
|
4
4
|
* bugs :: [github.com/rubylibs/textutils/issues](https://github.com/rubylibs/textutils/issues)
|
5
5
|
* gem :: [rubygems.org/gems/textutils](https://rubygems.org/gems/textutils)
|
6
6
|
* rdoc :: [rubydoc.info/gems/textutils](http://rubydoc.info/gems/textutils)
|
7
|
-
* forum :: [
|
7
|
+
* forum :: [ruby-talk@ruby-lang.org](https://www.ruby-lang.org/en/community/mailing-lists/)
|
8
8
|
|
9
9
|
|
10
10
|
## Filters
|
data/Rakefile
CHANGED
@@ -11,7 +11,7 @@ Hoe.spec 'textutils' do
|
|
11
11
|
self.urls = ['https://github.com/rubylibs/textutils']
|
12
12
|
|
13
13
|
self.author = 'Gerald Bauer'
|
14
|
-
self.email = '
|
14
|
+
self.email = 'ruby-talk@ruby-lang.org'
|
15
15
|
|
16
16
|
# switch extension to .markdown for github formatting
|
17
17
|
self.readme_file = 'README.markdown'
|
@@ -8,15 +8,28 @@ def strip_tags( ht )
|
|
8
8
|
### to be done
|
9
9
|
## strip markup tags; return plain text; use brute force for now
|
10
10
|
# check at least for presence of required a-z+ tag names
|
11
|
-
|
11
|
+
#
|
12
|
+
# note: make sure we cover h1/h2/h3/h4/h5/h6 tag w/ number!!
|
13
|
+
|
12
14
|
### ht.gsub( /<[^>]+>/, '' ) - old simple
|
13
15
|
|
14
16
|
## todo: add strip comments e.g. <!-- xxxx --> ???
|
15
17
|
## or use new strip_comments( ht )
|
16
18
|
|
17
|
-
|
18
|
-
|
19
|
-
|
19
|
+
|
20
|
+
## note: follow official xml spec
|
21
|
+
## - allows for first char: (Letter | '_' | ':')
|
22
|
+
## - allows for followup chars: (Letter | Digit | '_' | ':' | '.' | '-')
|
23
|
+
|
24
|
+
tag_name_pattern = "[a-z_:][a-z0-9_:.\\-]*"
|
25
|
+
|
26
|
+
empty_tag_pattern = "<#{tag_name_pattern}\\s*/>"
|
27
|
+
opening_tag_pattern = "<#{tag_name_pattern}(\\s+[^>]*)?>"
|
28
|
+
closing_tag_pattern = "</#{tag_name_pattern}\\s*>"
|
29
|
+
|
30
|
+
ht = ht.gsub( /#{empty_tag_pattern}/i, '' ) # remove xml-style empty tags eg. <br /> or <br/>
|
31
|
+
ht = ht.gsub( /#{opening_tag_pattern}/i, '' ) # opening tag <p>
|
32
|
+
ht = ht.gsub( /#{closing_tag_pattern}/i, '' ) # closing tag e.g. </p>
|
20
33
|
ht
|
21
34
|
end
|
22
35
|
|
@@ -52,8 +65,8 @@ def whitelist( ht, tags, opts={} )
|
|
52
65
|
# -- note: will NOT strip comments for now e.g. <!-- -->
|
53
66
|
ht = strip_tags( ht )
|
54
67
|
|
55
|
-
pp ht # fix: debugging indo - remove
|
56
|
-
|
68
|
+
## pp ht # fix: debugging info - remove
|
69
|
+
|
57
70
|
############################################
|
58
71
|
# step three - restore whitelisted tags
|
59
72
|
|
@@ -0,0 +1,66 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
module TextUtils

  # Collection of regex patterns for reuse.
  #
  # Every pattern is kept as a plain (single-quoted) String holding regex
  # source - not as a Regexp object - so it can be interpolated into a
  # bigger expression, e.g.
  #
  #   /\A#{YYYY_STRICT_20_PATTERN}-#{MM_STRICT_PATTERN}\z/

  ### todo: add a patterns.md page to github ??
  ##   - add regexper pics??

  ############
  # about ruby regexps
  #
  # try the rubular - Ruby regular expression editor and tester
  #  -> http://rubular.com
  #   code -> ?? by ??
  #
  #
  # Jeff Avallone's Regexper - Shows State-Automata Diagrams
  #  try -> http://regexper.com
  #   code -> https://github.com/javallone/regexper
  #
  #
  #  Regular Expressions | The Bastards Book of Ruby by Dan Nguyen
  #   http://ruby.bastardsbook.com/chapters/regexes/
  #
  # move to notes  regex|patterns on geraldb.github.io ??
  #

  # line that is empty or holds whitespace only
  EMPTY_LINE_PATTERN = '^\s*$'

  #################################
  ### Start of Line Comment Patterns

  COMMENT_LINE_PATTERN = '^\s*#'    # e.g. Ruby/Shell style starting w/ # this is a comment

  COMMENT_LINE_HASKELL_PATTERN = '^\s*--'  # e.g. Haskell/Ada? style starting w/ --
  COMMENT_LINE_ALT_PATTERN     = COMMENT_LINE_HASKELL_PATTERN

  COMMENT_LINE_TEX_PATTERN     = '^\s*%'   # e.g. TeX/LaTeX style starting w/ %
  COMMENT_LINE_ALT_II_PATTERN  = COMMENT_LINE_TEX_PATTERN

  #############################
  ### End of Line (EOL) Comment Patterns

  EOL_COMMENT_PATTERN = '\s+#.+$'   # fix: use \b word boundary instead of \s - why? why not?
  # why \b - everything but a-z0-9, that is, spaces but also includes umlauts, special chars etc.

  ##############
  ## Dates
  #
  # some info at www.regular-expressions.info/dates.html

  YYYY_STRICT_19_20_PATTERN = '(?:19|20)\d\d'
  YYYY_STRICT_20_PATTERN    = '20\d\d'

  # note: the alternations below are wrapped in a non-capturing group (?:...);
  #  without the group a pattern interpolated into a larger regex would split
  #  the WHOLE expression at every | (e.g. "#{yyyy}-#{mm}" would also match a
  #  lone "12"). The group does not add a capture, so existing capture group
  #  numbering in callers is unaffected.

  MM_STRICT_PATTERN = '(?:0[1-9]|1[012])'     # 01..12 (leading zero required)
  M_STRICT_PATTERN  = '(?:0?[1-9]|1[012])'    # 1..12  (leading zero optional)

  DD_STRICT_PATTERN = '(?:0[1-9]|[12][0-9]|3[01])'    # 01..31 (leading zero required)
  D_STRICT_PATTERN  = '(?:0?[1-9]|[12][0-9]|3[01])'   # 1..31  (leading zero optional)

  ######
  ## Time


end # TextUtils
|
@@ -0,0 +1,72 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
module TextUtils
|
4
|
+
|
5
|
+
# Converts hypertext (html source) into plain text by running a fixed
# sequence of regex-based rewrite steps (see #to_plain_text).
#
# note: the class name spelling (Sanitizier) is kept as-is; it is the
#  public name that gets required/referenced elsewhere.
class Sanitizier

  include LogUtils::Logging

  @@ignore_tags = %w{ head script style }   # removed together with their content
  @@inline_tags = %w{ span b i u }          # unwrapped; content followed by a space
  @@block_tags  = %w{ p div ul ol }         # unwrapped; content put on its own line

  # ht - hypertext (html source) as a String
  def initialize( ht )
    @ht = ht    # hypertext (html source)
  end

  # Returns the plain text version of the hypertext passed to the constructor.
  #
  # The source string itself is never modified - every step returns a new
  # string (the steps used to call gsub! and thereby changed the caller's
  # string in place and failed on frozen strings).
  def to_plain_text

    ht = @ht
    ht = handle_ignore_tags( ht )

    ## handle_pre_tags ?? - special rule for preformatted (keep whitespace)

    ht = handle_inline_tags( ht )
    ht = handle_block_tags( ht )
    ht = handle_other_tags( ht )   # rules for remain/left over tags

    ht = handle_entities( ht )

    ht
  end

  # Unescapes html entities; returns the unescaped string.
  #
  # NOTE(review): relies on CGI being loaded elsewhere - this class does not
  #  require it itself; confirm.
  # NOTE(review): CGI.unescapeHTML appears to decode only the basic entities
  #  (&amp; &lt; &gt; &quot;) and numeric references, but not generic named
  #  entities like &nbsp; - confirm.
  def handle_entities( ht )
    CGI.unescapeHTML( ht )
  end

  # Builds the regex matching one <tag ...>content</tag> element;
  # the content is available as capture group 1.
  #
  #  - content uses non-greedy .*? (and /m so that . spans newlines)
  #  - matching is case-insensitive (/i)
  #  - the tag name must be followed by whitespace or by > ;
  #    otherwise b would also match <br> or <body>, p would match <pre>,
  #    u would match <ul>, head would match <header> and so on
  def tag_regex( tag )
    name = Regexp.escape( tag )
    /<#{name}(?:\s[^>]*)?>(.*?)<\/#{name}\s*>/mi
  end

  # Removes all ignore tags (see @@ignore_tags) including their content.
  def handle_ignore_tags( ht )
    @@ignore_tags.inject( ht ) do |text, tag|
      text.gsub( tag_regex(tag), '' )
    end
  end

  # Replaces all inline tags (see @@inline_tags) with their content.
  def handle_inline_tags( ht )
    @@inline_tags.inject( ht ) do |text, tag|
      # add a space after
      text.gsub( tag_regex(tag), '\1 ' )
    end
  end

  # Replaces all block tags (see @@block_tags) with their content
  # surrounded by newlines.
  def handle_block_tags( ht )
    @@block_tags.inject( ht ) do |text, tag|
      # note: inside double quotes the backreference must be written \\1;
      #  a plain "\1" is an octal escape (byte 0x01) and drops the content
      text.gsub( tag_regex(tag), "\n\\1\n" )
    end
  end

  # Strips whatever tag markup is left over (opening, closing and xml-style
  # empty tags); the text between the tags is kept.
  #
  # Tag names follow the xml rule - first char: letter | _ | : followed by
  # letter | digit | _ | : | . | - ; comments e.g. <!-- --> are NOT stripped.
  #
  # note: this step is called by to_plain_text but had no definition in
  #  this class.
  def handle_other_tags( ht )
    ht.gsub( /<\/?[a-z_:][a-z0-9_:.\-]*(?:\s[^>]*)?\/?>/i, '' )
  end

end # class Sanitizier
|
71
|
+
|
72
|
+
end # module TextUtils
|
data/lib/textutils/utils.rb
CHANGED
@@ -39,8 +39,14 @@ def find_data_path_from_gemfile_gitref( name )
|
|
39
39
|
|
40
40
|
# escape chars for regex e.g. . becomes \.
|
41
41
|
name_esc = name.gsub( '.', '\.' )
|
42
|
-
name_regex = /\/(#{name_esc}-[a-z0-9]+)|(#{name_esc})\/lib$/ # e.g. /\/(beer\.db-[a-z0-9]+)|(beer\.db)\//
|
43
42
|
|
43
|
+
|
44
|
+
# note:
|
45
|
+
# - hexdigest must be 12 chars e.g. b7d1c9619a54 or similar
|
46
|
+
|
47
|
+
# e.g. match /\/(beer\.db-[a-z0-9]+)|(beer\.db)\//
|
48
|
+
|
49
|
+
name_regex = /\/((#{name_esc}-[a-z0-9]{12})|(#{name_esc}))\/lib$/
|
44
50
|
candidates = []
|
45
51
|
$LOAD_PATH.each do |path|
|
46
52
|
if path =~ name_regex
|
data/lib/textutils/version.rb
CHANGED
data/lib/textutils.rb
CHANGED
@@ -16,6 +16,9 @@ require 'logutils'
|
|
16
16
|
|
17
17
|
require 'textutils/version'
|
18
18
|
|
19
|
+
require 'textutils/patterns' # regex patterns for reuse
|
20
|
+
require 'textutils/sanitizier'
|
21
|
+
|
19
22
|
require 'textutils/filter/code_filter'
|
20
23
|
require 'textutils/filter/comment_filter'
|
21
24
|
require 'textutils/filter/erb_django_filter'
|
@@ -13,6 +13,44 @@ class TestHypertextHelper < MiniTest::Unit::TestCase
|
|
13
13
|
|
14
14
|
include TextUtils::HypertextHelper # lets us use textify, etc.
|
15
15
|
|
16
|
+
# Checks strip_tags against a table of [expected, markup] pairs;
# the pairs are listed (and asserted) in the order of the original checks.
def test_strip_tags
  samples = [
    ## empty tags
    [ '', '<hr />' ],
    [ '', '<hr/>' ],
    [ '', '<my-emtpy/>' ],
    [ '', '<my-emtpy />' ],

    ## plain elements (incl. h1/h2 - tag names w/ number)
    [ 'hello', '<h1>hello</h1>' ],
    [ 'hello', '<h2>hello</h2>' ],
    [ 'hello', '<p>hello</p>' ],
    [ 'hello', '<div>hello</div>' ],
    [ 'hello', '<my-header>hello</my-header>' ],

    ## elements w/ attribute
    [ 'hello', '<h1 id="test">hello</h1>' ],
    [ 'hello', '<p id="test">hello</p>' ],
    [ 'hello', '<div id="test">hello</div>' ],
    [ 'hello', '<my-header id="test">hello</my-header>' ],

    ## check case in-sensitive
    [ '', '<HR />' ],
    [ '', '<hR />' ],
    [ '', '<Hr />' ],
    [ '', '<HR/>' ],
    [ '', '<My-EmTpY/>' ],
    [ '', '<My-EmTpY />' ],

    [ 'hello', '<H1>hello</H1>' ],
    [ 'hello', '<H2>hello</h2>' ],
    [ 'hello', '<P>hello</P>' ],
    [ 'hello', '<DiV>hello</dIv>' ],
    [ 'hello', '<mY-hEaDer>hello</MY-HEADER>' ],

    [ 'hello', '<H1 ID="test">hello</h1>' ],
    [ 'hello', '<P id="test">hello</p>' ],
    [ 'hello', '<DIV Id="test">hello</dIV>' ],
    [ 'hello', '<MY-HEADER iD="test">hello</mY-hEaDeR>' ]
  ]

  samples.each do |expected, markup|
    assert_equal expected, strip_tags( markup )
  end
end
|
53
|
+
|
16
54
|
|
17
55
|
def test_stylesheet_link_tag
|
18
56
|
hyout = "<link rel='stylesheet' type='text/css' href='hello.css'>"
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: textutils
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.7.
|
4
|
+
version: 0.7.1
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-
|
12
|
+
date: 2013-12-25 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: logutils
|
16
|
-
requirement: &
|
16
|
+
requirement: &21849324 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ~>
|
@@ -21,31 +21,31 @@ dependencies:
|
|
21
21
|
version: '0.5'
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *21849324
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: rdoc
|
27
|
-
requirement: &
|
27
|
+
requirement: &21848640 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ~>
|
31
31
|
- !ruby/object:Gem::Version
|
32
|
-
version: '
|
32
|
+
version: '4.0'
|
33
33
|
type: :development
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *21848640
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: hoe
|
38
|
-
requirement: &
|
38
|
+
requirement: &21848040 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ~>
|
42
42
|
- !ruby/object:Gem::Version
|
43
|
-
version: '3.
|
43
|
+
version: '3.7'
|
44
44
|
type: :development
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *21848040
|
47
47
|
description: textutils - Text Filters, Helpers, Readers and More
|
48
|
-
email:
|
48
|
+
email: ruby-talk@ruby-lang.org
|
49
49
|
executables: []
|
50
50
|
extensions: []
|
51
51
|
extra_rdoc_files:
|
@@ -69,11 +69,13 @@ files:
|
|
69
69
|
- lib/textutils/helper/unicode_helper.rb
|
70
70
|
- lib/textutils/helper/value_helper.rb
|
71
71
|
- lib/textutils/helper/xml_helper.rb
|
72
|
+
- lib/textutils/patterns.rb
|
72
73
|
- lib/textutils/reader/code_reader.rb
|
73
74
|
- lib/textutils/reader/fixture_reader.rb
|
74
75
|
- lib/textutils/reader/hash_reader.rb
|
75
76
|
- lib/textutils/reader/line_reader.rb
|
76
77
|
- lib/textutils/reader/values_reader.rb
|
78
|
+
- lib/textutils/sanitizier.rb
|
77
79
|
- lib/textutils/utils.rb
|
78
80
|
- lib/textutils/version.rb
|
79
81
|
- test/helper.rb
|
@@ -104,11 +106,11 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
104
106
|
version: '0'
|
105
107
|
requirements: []
|
106
108
|
rubyforge_project: textutils
|
107
|
-
rubygems_version: 1.8.
|
109
|
+
rubygems_version: 1.8.16
|
108
110
|
signing_key:
|
109
111
|
specification_version: 3
|
110
112
|
summary: textutils - Text Filters, Helpers, Readers and More
|
111
113
|
test_files:
|
112
|
-
- test/test_unicode_helper.rb
|
113
114
|
- test/test_hypertext_helper.rb
|
114
115
|
- test/test_title_helper.rb
|
116
|
+
- test/test_unicode_helper.rb
|